@techreport{sapio2019scaling,
  author      = {Sapio, Amedeo and Canini, Marco and Ho, Chen-Yu and Nelson, Jacob and Kalnis, Panos and Kim, Changhoon and Krishnamurthy, Arvind and Moshref, Masoud and Ports, Dan R. K. and Richt{\'a}rik, Peter},
  title       = {Scaling Distributed Machine Learning with In-Network Aggregation},
  institution = {Microsoft Research},
  number      = {MSR-TR-2019-9},
  year        = {2019},
  month       = {February},
  url         = {https://www.microsoft.com/en-us/research/publication/scaling-distributed-machine-learning-with-in-network-aggregation/},
  abstract    = {Training complex machine learning models in parallel is an increasingly important workload. We accelerate distributed parallel training by designing a communication primitive that uses a programmable switch dataplane to execute a key step of the training process. Our approach, SwitchML, reduces the volume of exchanged data by aggregating the model updates from multiple workers in the network. We co-design the switch processing with the end-host protocols and ML frameworks to provide a robust, efficient solution that speeds up training by up to 300%, and at least by 20%, for a number of real-world benchmark models.},
}