@inproceedings{soifer2019deep,
  author    = {Soifer, Jonathan and Li, Jason and Li, Mingqin and Zhu, Jeffrey and Li, Yingnan and He, Yuxiong and Zheng, Elton and Oltean, Adi and Mosyak, Maya and Barnes, Chris and Liu, Thomas and Wang, Junhua},
  title     = {Deep Learning Inference Service at Microsoft},
  booktitle = {2019 USENIX Conference on Operational Machine Learning (OpML '19)},
  year      = {2019},
  month     = may,
  abstract  = {This paper introduces the Deep Learning Inference Service, an online production service at Microsoft for ultra-low-latency deep neural network model inference. We present the system architecture and deep dive into core concepts such as intelligent model placement, heterogeneous resource management, resource isolation, and efficient routing. We also present production scale and performance numbers.},
  url       = {https://www.microsoft.com/en-us/research/publication/deep-learning-inference-service-at-microsoft/},
}