@inproceedings{hongyuzhuand2022roller,
  author    = {Zhu, Hongyu and Wu, Ruofan and Diao, Yijia and Ke, Shanbin and Li, Haoyu and Zhang, Chen and Xue, Jilong and Ma, Lingxiao and Xia, Yuqing and Cui, Wei and Yang, Fan and Yang, Mao and Zhou, Lidong and Cidon, Asaf and Pekhimenko, Gennady},
  title     = {{Roller}: Fast and Efficient Tensor Compilation for Deep Learning},
  booktitle = {The 16th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI}'22)},
  year      = {2022},
  month     = jul,
  abstract  = {Despite recent advances in tensor compilers, it often costs hours to generate an efficient kernel for an operator, a compute-intensive sub-task in a deep neural network (DNN), on various accelerators (e.g., GPUs). This significantly slows down DNN development cycles and incurs heavy burdens on the development of general kernel libraries and custom kernels, especially for new hardware vendors. The slow compilation process is due to the large search space formulated by existing DNN compilers, which have to use machine learning algorithms to find good solutions. In this paper, we present ROLLER, which takes a different construction-based approach to generate kernels. At the core of ROLLER is rTile, a new tile abstraction that encapsulates tensor shapes that align with the key features of the underlying accelerator, thus achieving efficient execution by limiting the shape choices. ROLLER then adopts a recursive rTile-based construction algorithm to generate rTile-based programs (rProgram), whose performance can be evaluated efficiently with a micro-performance model without being evaluated in a real device. As a result, ROLLER can generate efficient kernels in seconds, with comparable performance to the state-of-the-art solutions on popular accelerators like GPUs, while offering better kernels on less mature accelerators like IPUs.},
  url       = {https://www.microsoft.com/en-us/research/publication/roller-fast-and-efficient-tensor-compilation-for-deep-learning/},
}