@inproceedings{qin2025scaling,
  author    = {Qin, Zeyu and Dong, Qingxiu and Zhang, Xingxing and Dong, Li and Huang, Xiaolong and Yang, Ziyi and Khademi, Mahmoud and Zhang, Dongdong and Awadalla, Hany Hassan and Fung, Yi R. and Chen, Weizhu and Cheng, Minhao and Wei, Furu},
  title     = {Scaling Laws of Synthetic Data for Language Models},
  booktitle = {COLM 2025},
  year      = {2025},
  month     = {July},
  abstract  = {Large language models (LLMs) achieve strong performance across diverse tasks, driven by high-quality web data used in pre-training. However, recent studies indicate that web data is rapidly depleting. Synthetic data emerges as a promising alternative, but it remains unclear whether synthetic datasets exhibit predictable scalability comparable to raw pre-training data. In this work, we systematically investigate scaling laws of synthetic data by introducing SynthLLM, a scalable framework that transforms pre-training corpora into diverse, high-quality synthetic datasets. Our approach achieves this by automatically extracting and recombining high-level concepts across multiple documents using a graph algorithm. Key findings from our experiments with SynthLLM on the math domain include: (1) SynthLLM generates synthetic data that reliably adheres to the rectified scaling law across various model sizes; (2) performance gains gradually diminish near 300B tokens; and (3) larger models approach optimal performance with fewer training tokens. For instance, an 8B model peaks at 1T tokens, while a 3B model requires 4T. Moreover, comparisons with existing synthetic data generation and augmentation methods demonstrate that SynthLLM achieves superior performance and scalability. Our findings highlight synthetic data as a scalable and reliable alternative to raw pre-training data, offering a viable path toward continued improvement in model performance.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/scaling-laws-of-synthetic-data-for-language-models/},
}
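
For context on the "rectified scaling law" named in the abstract, a minimal sketch of the functional form commonly used in the rectified-scaling-law literature is given below. The entry above does not state the paper's exact parameterization, so the symbols D_l, B, beta, and E are assumptions for illustration, not taken from this paper.

% A minimal sketch, assuming the standard rectified form: validation loss L as a
% function of the training-token count D. Not this paper's definitive formula.
%   D   : number of (synthetic) training tokens
%   D_l : assumed "pre-learned" data size that rectifies behavior at small D
%   B, beta : fitted constants controlling scale and decay rate
%   E   : irreducible loss floor
\[
  L(D) \;=\; \frac{B}{\left(D_l + D\right)^{\beta}} \;+\; E
\]

Under this form, loss decreases predictably as D grows and flattens toward E, which is consistent with the abstract's observation that gains diminish as token counts approach the hundreds of billions.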