@inproceedings{hashimoto2021model,
  author    = {Hashimoto, Tatsunori},
  title     = {Model Performance Scaling with Multiple Data Sources},
  booktitle = {International Conference on Machine Learning (ICML 2021)},
  year      = {2021},
  month     = {July},
  abstract  = {Real-world machine learning systems are often trained using a mix of data sources with varying cost and quality. Understanding how the size and composition of a training dataset affect model performance is critical for advancing our understanding of generalization, as well as designing more effective data collection policies. We show that there is a simple scaling law that predicts the loss incurred by a model even under varying dataset composition. Our work expands recent observations of scaling laws for log-linear generalization error in the i.i.d. setting and uses this to cast model performance prediction as a learning problem. Using the theory of optimal experimental design, we derive a simple rational function approximation to generalization error that can be fitted using a few model training runs. Our approach can achieve highly accurate (r^2 ≈ 0.9) predictions of model performance under substantial extrapolation in two different standard supervised learning tasks and is accurate (r^2 ≈ 0.83) on more challenging machine translation and question answering tasks where many baselines achieve worse-than-random performance.},
  publisher = {PMLR},
  url       = {http://approjects.co.za/?big=en-us/research/publication/model-performance-scaling-with-multiple-data-sources/},
  pages     = {4107--4116},
}
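
The abstract describes fitting a rational-function approximation of generalization error from a handful of pilot training runs and then extrapolating to new dataset compositions. The sketch below is only an illustration of that general workflow, not the functional form derived in the paper: the specific rational form, the two-source setup, and all numbers (pilot losses, initial guesses, the query mixture) are hypothetical assumptions for demonstration.

```python
# Minimal sketch (assumed form, synthetic numbers): fit a simple rational
# approximation of loss as a function of per-source dataset sizes using a few
# pilot training runs, then extrapolate to a larger, unseen data mixture.
import numpy as np
from scipy.optimize import curve_fit

# Each row gives (n1, n2): examples drawn from data source 1 and source 2.
# Losses are made-up values standing in for pilot-run results.
n = np.array([[1e3, 0.0], [0.0, 1e3], [1e3, 1e3], [5e3, 1e3], [1e3, 5e3]])
loss = np.array([0.63, 0.80, 0.55, 0.38, 0.43])

def rational_loss(n, c, b1, b2):
    """Assumed rational form: irreducible error c plus a term that shrinks
    as a weighted count of examples from the two sources grows."""
    n1, n2 = n[:, 0], n[:, 1]
    return c + 1.0 / (1.0 + b1 * n1 + b2 * n2)

# Fit the three parameters to the pilot runs (non-negative constraints).
params, _ = curve_fit(rational_loss, n, loss,
                      p0=[0.1, 1e-3, 1e-3], bounds=(0.0, np.inf))

# Extrapolate: predicted loss for a much larger mixture of the two sources.
query = np.array([[5e4, 2e4]])
print("predicted loss:", rational_loss(query, *params))
```

In this toy setup the fitted curve is queried far outside the range of the pilot runs, which mirrors the extrapolation regime the abstract reports r^2 values for; the paper itself derives its approximation from optimal experimental design rather than positing one ad hoc as done here.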