@inproceedings{shen2019nexus,
  author    = {Shen, Haichen and Chen, Lequn and Jin, Yuchen and Zhao, Liangyu and Kong, Bingyu and Philipose, Matthai and Krishnamurthy, Arvind and Sundaram, Ravi},
  title     = {Nexus: A GPU Cluster Engine for Accelerating DNN-Based Video Analysis},
  booktitle = {SOSP '19: Proceedings of the 27th ACM Symposium on Operating Systems Principles},
  year      = {2019},
  month     = {October},
  abstract  = {We address the problem of serving Deep Neural Networks (DNNs) efficiently from a cluster of GPUs. In order to realize the promise of very low-cost processing made by accelerators such as GPUs, it is essential to run them at sustained high utilization. Doing so requires cluster-scale resource management that performs detailed scheduling of GPUs, reasoning about groups of DNN invocations that need to be coscheduled, and moving from the conventional whole-DNN execution model to executing fragments of DNNs. Nexus is a fully implemented system that includes these innovations. In large-scale case studies on 16 GPUs, when required to stay within latency constraints at least 99% of the time, Nexus can process requests at rates 1.8-12.7× higher than state-of-the-art systems can. A long-running multi-application deployment stays within 84% of optimal utilization and, on a 100-GPU cluster, violates latency SLOs on 0.27% of requests.},
  url       = {https://www.microsoft.com/en-us/research/publication/nexus-a-gpu-cluster-engine-for-accelerating-dnn-based-video-analysis/},
  pages     = {322--337},
}