% PANAMA: conference paper (MLSys 2021) by Gebara, Costa and Ghobadi.
% NOTE(review): url was rewritten from an approjects.co.za mirror address to
% the corresponding microsoft.com publication path -- confirm it resolves.
@inproceedings{gebara2021panama,
  author    = {Gebara, Nadeen and Costa, Paolo and Ghobadi, Manya},
  title     = {{PANAMA}: In-network Aggregation for Shared Machine Learning Clusters},
  booktitle = {Conference on Machine Learning and Systems ({MLSys})},
  year      = {2021},
  month     = apr,
  abstract  = {We present PANAMA, a novel in-network aggregation framework for distributed machine learning (ML) training on shared clusters serving a variety of jobs. PANAMA comprises two key components: (i) a custom in-network hardware accelerator that can support floating-point gradient aggregation at line rate without compromising accuracy; and (ii) a lightweight load-balancing and congestion control protocol that exploits the unique communication patterns of ML data-parallel jobs to enable fair sharing of network resources across different jobs while ensuring high throughput for long-running jobs and low latency for short jobs and other latency-sensitive traffic. We evaluate the feasibility of PANAMA using an FPGA-based prototype with 10~Gbps transceivers and large-scale simulations. Our simulation results demonstrate that PANAMA decreases the average training time of large jobs by up to a factor of 1.34. More importantly, by drastically decreasing the load placed on the network by large data-parallel jobs, PANAMA provides significant benefits to non-aggregation flows too, especially latency-sensitive short flows, reducing their 99\%-tile completion time by up to 4.5x.},
  url       = {https://www.microsoft.com/en-us/research/publication/panama-in-network-aggregation-for-shared-machine-learning-clusters/},
}