@inproceedings{liu2024rethinking,
  author    = {Liu, Xuting and Arzani, Behnaz and Kakarla, Siva Kesava Reddy and Zhao, Liangyu and Liu, Vincent and Castro, Miguel and Kandula, Srikanth and Marshall, Luke},
  title     = {Rethinking Machine Learning Collective Communication as a Multi-Commodity Flow Problem},
  booktitle = {SIGCOMM},
  publisher = {ACM},
  year      = {2024},
  month     = aug,
  abstract  = {Cloud operators utilize collective communication optimizers to enhance the efficiency of the single-tenant, centrally managed training clusters they manage. However, current optimizers struggle to scale for such use cases and often compromise solution quality for scalability. Our solution, TE-CCL, adopts a traffic-engineering-based approach to collective communication. Compared to a state-of-the-art optimizer, TACCL, TE-CCL produced schedules with 2{$\times$} better performance on topologies TACCL supports (and its solver took a similar amount of time as TACCL's heuristic-based approach). TECCL additionally scales to larger topologies than TACCL. On our GPU testbed, TE-CCL outperformed TACCL by 2.14{$\times$} and RCCL by 3.18{$\times$} in terms of algorithm bandwidth.},
  url       = {https://www.microsoft.com/en-us/research/publication/rethinking-machine-learning-collective-communication-as-a-multi-commodity-flow-problem/},
}