@unpublished{gong2024not,
  author   = {Gong, Yeyun and Liu, Xiao and Shen, Yelong and Xu, Ruochen and Jiao, Jian and Duan, Nan and Chen, Weizhu},
  title    = {Not All Tokens Are What You Need for Pretraining},
  year     = {2024},
  month    = {October},
  abstract = {Previous language model pre-training methods have uniformly applied a next-token prediction loss to all training tokens. Challenging this norm, we posit that "Not all tokens in a corpus are equally important for language model training". Our initial analysis examines token-level training dynamics of language models, revealing distinct loss patterns for different tokens. Leveraging these insights, we introduce a new language model called Rho-1. Unlike traditional LMs that learn to predict every next token in a corpus, Rho-1 employs Selective Language Modeling (SLM), which selectively trains on useful tokens that are aligned with the desired distribution. This approach involves scoring pretraining tokens using a reference model, and then training the language model with a focused loss on tokens with higher scores. When continually pretraining on the 15B-token OpenWebMath corpus, Rho-1 yields an absolute improvement in few-shot accuracy of up to 30% on 9 math tasks. After fine-tuning, Rho-1-1B and Rho-1-7B achieve state-of-the-art results of 40.6% and 51.8% on the MATH dataset, respectively, matching DeepSeekMath with only 3% of the pretraining tokens. Furthermore, when pretrained on 80B general tokens, Rho-1 achieves an average improvement of 6.8% across 15 diverse tasks, increasing both the efficiency and performance of language model pre-training.},
  url      = {http://approjects.co.za/?big=en-us/research/publication/not-all-tokens-are-what-you-need-for-pretraining/},
}
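
The abstract's description of Selective Language Modeling (score pretraining tokens with a reference model, then focus the next-token loss on the higher-scoring tokens) can be sketched in a few lines of PyTorch. This is a minimal illustration under stated assumptions, not the paper's implementation: the excess-loss score (training-model loss minus reference-model loss), the keep_ratio selection fraction, and the name selective_lm_loss are illustrative choices, and the logits are assumed to already be shifted so that position t predicts target_ids[:, t].

    import torch
    import torch.nn.functional as F

    def selective_lm_loss(train_logits, ref_logits, target_ids, keep_ratio=0.6):
        """train_logits, ref_logits: (batch, seq, vocab); target_ids: (batch, seq)."""
        def per_token_ce(logits):
            # Per-token cross-entropy, kept unreduced so each token can be scored.
            return F.cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                target_ids.reshape(-1),
                reduction="none",
            ).view_as(target_ids)

        train_loss = per_token_ce(train_logits)          # differentiable
        with torch.no_grad():
            ref_loss = per_token_ce(ref_logits)          # reference model scores tokens
            score = train_loss - ref_loss                # assumed score: excess loss
            k = max(1, int(keep_ratio * score.numel()))
            threshold = score.reshape(-1).topk(k).values.min()
            mask = (score >= threshold).float()          # keep only higher-scoring tokens
        # Average the training loss over the selected tokens only.
        return (train_loss * mask).sum() / mask.sum().clamp(min=1.0)

The design choice here is that token selection happens inside torch.no_grad(), so gradients flow only through the focused loss on the retained tokens, which is the effect the abstract describes.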