@inproceedings{sapio2021scaling,
  author    = {Sapio, Amedeo and Canini, Marco and Ho, Chen-Yu and Nelson, Jacob and Kalnis, Panos and Kim, Changhoon and Krishnamurthy, Arvind and Moshref, Masoud and Ports, Dan R. K. and Richt{\'a}rik, Peter},
  title     = {Scaling Distributed Machine Learning with In-Network Aggregation},
  booktitle = {18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)},
  publisher = {USENIX Association},
  year      = {2021},
  month     = apr,
  abstract  = {Training machine learning models in parallel is an increasingly important workload. We accelerate distributed parallel training by designing a communication primitive that uses a programmable switch dataplane to execute a key step of the training process. Our approach, SwitchML, reduces the volume of exchanged data by aggregating the model updates from multiple workers in the network. We co-design the switch processing with the end-host protocols and ML frameworks to provide an efficient solution that speeds up training by up to 5.5x for a number of real-world benchmark models.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/scaling-distributed-machine-learning-with-in-network-aggregation-2/},
}