@inproceedings{wang2024zero,
  author    = {Wang, Guanhua and Qin, Heyang and Jacobs, Sam Ade and Wu, Xiaoxia and Holmes, Connor and Yao, Zhewei and Rajbhandari, Samyam and Ruwase, Olatunji and Yang, Feng and Yang, Lei and He, Yuxiong},
  title     = {ZeRO++: Extremely Efficient Collective Communication for Large Model Training},
  booktitle = {ICLR 2024},
  year      = {2024},
  month     = {March},
  abstract  = {While the Zero Redundancy Optimizer (ZeRO) excels at training large-scale models, it struggles to achieve good throughput in environments with limited bandwidth or small batch sizes, where communication becomes a major bottleneck. Inspired by the principles of fine-grained quantization in machine learning algorithms, we designed ZeRO++, an optimizer robust to quantization effects that achieves significant communication volume reduction using low-precision quantization techniques. ZeRO++ combines three communication volume reduction techniques (low-precision all-gather, data remapping, and low-precision gradient averaging) to reduce communication volume by up to 4x, enabling up to 2.16x better throughput at 384-GPU scale. Our results also show that ZeRO++ can speed up RLHF training by 3.3x compared to vanilla ZeRO. To verify the convergence of ZeRO++, we test pretraining of models up to 13B parameters with 8/6-bit all-gather and finetuning of models up to 30B parameters with 4-bit or 2-bit all-gather, and demonstrate accuracy on par with the original ZeRO (i.e., standard training). As a byproduct, a model trained with ZeRO++ is weight-quantized and can be used directly for inference without post-training quantization or quantization-aware training.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/zero-extremely-efficient-collective-communication-for-large-model-training/},
}