@inproceedings{sapio2021scaling,
  author    = {Sapio, Amedeo and Canini, Marco and Ho, Chen-Yu and Nelson, Jacob and Kalnis, Panos and Kim, Changhoon and Krishnamurthy, Arvind and Moshref, Masoud and Ports, Dan R. K. and Richt{\'a}rik, Peter},
  title     = {Scaling Distributed Machine Learning with In-Network Aggregation},
  booktitle = {18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)},
  publisher = {USENIX Association},
  year      = {2021},
  month     = apr,
  abstract  = {Training machine learning models in parallel is an increasingly important workload. We accelerate distributed parallel training by designing a communication primitive that uses a programmable switch dataplane to execute a key step of the training process. Our approach, SwitchML, reduces the volume of exchanged data by aggregating the model updates from multiple workers in the network. We co-design the switch processing with the end-host protocols and ML frameworks to provide an efficient solution that speeds up training by up to 5.5x for a number of real-world benchmark models.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/scaling-distributed-machine-learning-with-in-network-aggregation-2/},
}