@inproceedings{narayanan2018accelerating,
  author    = {Narayanan, Deepak and Santhanam, Keshav and Phanishayee, Amar and Zaharia, Matei},
  title     = {Accelerating Deep Learning Workloads Through Efficient Multi-Model Execution},
  booktitle = {NeurIPS Workshop on Systems for Machine Learning},
  year      = {2018},
  month     = {December},
  abstract  = {Deep neural networks (DNNs) with millions of parameters are increasingly being used in a variety of domains. To keep pace with this growing computational demand, GPUs have become progressively more powerful. However, many multi-model workloads are not able to leverage the available computational capacity. For example, model search applications use smaller models to automatically design model architectures for a given task, and low-latency model serving applications use a small minibatch size. We show that the natural baseline of simply launching GPU operations from different models in parallel fails to provide substantial speedups due to data transfer, memory-bound kernels, and the overhead of kernel launches for short-duration kernels. We propose HiveMind, a system that optimizes multi-model deep learning workloads through several techniques. HiveMind optimizes a ``model batch'' by performing cross-model operator fusion and sharing I/O across models. HiveMind then uses a parallel runtime to efficiently execute this fused graph. Preliminary results show HiveMind can accelerate simple hyperparameter tuning and multi-model inference workloads by up to 10x on NVIDIA P100 and V100 GPUs compared to sequential model execution.},
  url       = {https://www.microsoft.com/en-us/research/publication/accelerating-deep-learning-workloads-through-efficient-multi-model-execution/},
}