@inproceedings{liu2024rethinking,
  author    = {Liu, Xuting and Arzani, Behnaz and Kakarla, Siva Kesava Reddy and Zhao, Liangyu and Liu, Vincent and Castro, Miguel and Kandula, Srikanth and Marshall, Luke},
  title     = {Rethinking Machine Learning Collective Communication as a Multi-Commodity Flow Problem},
  booktitle = {SIGCOMM},
  publisher = {ACM},
  year      = {2024},
  month     = aug,
  abstract  = {Cloud operators utilize collective communication optimizers to enhance the efficiency of the single-tenant, centrally managed training clusters they manage. However, current optimizers struggle to scale for such use cases and often compromise solution quality for scalability. Our solution, TE-CCL, adopts a traffic-engineering-based approach to collective communication. Compared to a state-of-the-art optimizer, TACCL, TE-CCL produced schedules with 2{$\times$} better performance on topologies TACCL supports (and its solver took a similar amount of time as TACCL's heuristic-based approach). TECCL additionally scales to larger topologies than TACCL. On our GPU testbed, TE-CCL outperformed TACCL by 2.14{$\times$} and RCCL by 3.18{$\times$} in terms of algorithm bandwidth.},
  url       = {https://www.microsoft.com/en-us/research/publication/rethinking-machine-learning-collective-communication-as-a-multi-commodity-flow-problem/},
}