% Conference paper entry for the PANAMA in-network aggregation paper.
% Acronyms are brace-protected so sentence-casing styles keep their capitals;
% month uses the predefined macro so styles can abbreviate or localise it.
% NOTE(review): the url host below looks like a proxied mirror rather than the
% publisher's own page -- verify and replace with the canonical link (or a DOI).
@inproceedings{gebara2021panama,
  author    = {Gebara, Nadeen and Costa, Paolo and Ghobadi, Manya},
  title     = {{PANAMA}: In-network Aggregation for Shared Machine Learning Clusters},
  booktitle = {Conference on Machine Learning and Systems ({MLSys})},
  year      = {2021},
  month     = apr,
  abstract  = {We present PANAMA, a novel in-network aggregation framework for distributed machine learning (ML) training on shared clusters serving a variety of jobs. PANAMA comprises two key components: (i) a custom in-network hardware accelerator that can support floating-point gradient aggregation at line rate without compromising accuracy; and (ii) a lightweight load-balancing and congestion control protocol that exploits the unique communication patterns of ML data-parallel jobs to enable fair sharing of network resources across different jobs while ensuring high throughput for long-running jobs and low latency for short jobs and other latency-sensitive traffic. We evaluate the feasibility of PANAMA using an FPGA-based prototype with 10~Gbps transceivers and large-scale simulations. Our simulation results demonstrate that PANAMA decreases the average training time of large jobs by up to a factor of 1.34. More importantly, by drastically decreasing the load placed on the network by large data-parallel jobs, PANAMA provides significant benefits to non-aggregation flows too, especially latency-sensitive short flows, reducing their 99\%-tile completion time by up to 4.5x.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/panama-in-network-aggregation-for-shared-machine-learning-clusters/},
}