@article{chung2018serving,
  author   = {Chung, Eric and Fowers, Jeremy and Ovtcharov, Kalin and Papamichael, Michael and Caulfield, Adrian and Massengill, Todd and Liu, Ming and Ghandi, Mahdi and Lo, Daniel and Reinhardt, Steve and Alkalay, Shlomi and Angepat, Hari and Chiou, Derek and Forin, Alessandro and Burger, Doug and Woods, Lisa and Weisz, Gabriel and Haselman, Michael and Zhang, Dan},
  title    = {Serving {DNNs} in Real Time at Datacenter Scale with {Project Brainwave}},
  journal  = {IEEE Micro},
  volume   = {38},
  number   = {2},
  pages    = {8--20},
  year     = {2018},
  month    = {March},
  url      = {https://www.microsoft.com/en-us/research/publication/serving-dnns-real-time-datacenter-scale-project-brainwave/},
  abstract = {To meet the computational demands required of deep learning, cloud operators are turning toward specialized hardware for improved efficiency and performance. Project Brainwave, Microsoft's principal infrastructure for real-time AI serving, accelerates deep neural network (DNN) inferencing in major services such as Bing's intelligent search features and Azure. Exploiting distributed model parallelism and pinning over low-latency hardware microservices, Project Brainwave serves state-of-the-art, pre-trained DNN models with high efficiencies at low batch sizes. A high-performance, precision-adaptable FPGA soft processor is at the heart of the system, achieving up to 39.5 teraflops of effective performance at Batch 1 on a state-of-the-art Intel Stratix 10 FPGA.},
}