@inproceedings{yan2021fastseq,
  author    = {Yan, Yu and Hu, Fei and Chen, Jiusheng and Bhendawade, Nikhil and Ye, Ting and Gong, Yeyun and Duan, Nan and Cui, Desheng and Chi, Bingyu and Zhang, Ruofei},
  title     = {FastSeq: Make Sequence Generation Faster},
  booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: System Demonstrations},
  year      = {2021},
  month     = {August},
  abstract  = {Transformer-based models have made a tremendous impact on natural language generation. However, inference speed is a bottleneck due to the large model size and the intensive computation involved in the auto-regressive decoding process. We develop the FastSeq framework to accelerate sequence generation without accuracy loss. The proposed optimization techniques include an attention cache optimization, an efficient algorithm for detecting repeated n-grams, and an asynchronous generation pipeline with parallel I/O. These optimizations are general enough to be applicable to Transformer-based models (e.g., T5, GPT2, and UniLM). Our benchmark results on a set of widely used and diverse models demonstrate a 4-9x gain in inference speed. Additionally, FastSeq is easy to use with a simple one-line code change. The source code is available at https://github.com/microsoft/fastseq.},
  url       = {https://www.microsoft.com/en-us/research/publication/fastseq-make-sequence-generation-faster/},
}
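
The abstract's "simple one-line code change" refers, per the linked repository's documented usage pattern, to importing fastseq before the underlying generation library so its optimizations patch in transparently. Below is a hedged sketch of that usage; the checkpoint name and generation arguments are illustrative, not taken from the paper.

# Sketch of the one-line FastSeq integration (assumes `pip install fastseq`
# and a compatible transformers version; model/arguments are illustrative).
import fastseq  # the one-line change: must come before the transformers import
from transformers import BartForConditionalGeneration, BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

inputs = tokenizer("FastSeq accelerates auto-regressive decoding.", return_tensors="pt")
summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=50)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))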
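On the "efficient algorithm for detecting repeated n-grams": a common way to make repeated n-gram blocking efficient is to maintain a hash map from each (n-1)-token prefix to the tokens that would complete an already seen n-gram, so each decoding step costs O(1) instead of rescanning the whole sequence. The sketch below illustrates that idea only; it is a CPU, single-sequence toy, not the paper's batched GPU implementation, and the class name is hypothetical.

# Minimal sketch of incremental repeated n-gram blocking (illustrative,
# not FastSeq's actual implementation).
from collections import defaultdict

class NGramBlocker:
    def __init__(self, n: int):
        self.n = n
        # Maps an (n-1)-token prefix to the set of next tokens that would
        # repeat an n-gram already present in the generated sequence.
        self.banned = defaultdict(set)

    def update(self, tokens: list[int]) -> None:
        """Record the newest n-gram after a token is appended."""
        if len(tokens) >= self.n:
            prefix = tuple(tokens[-self.n:-1])
            self.banned[prefix].add(tokens[-1])

    def banned_next(self, tokens: list[int]) -> set[int]:
        """Tokens that would repeat an already generated n-gram."""
        if len(tokens) < self.n - 1:
            return set()
        return self.banned[tuple(tokens[-(self.n - 1):])]

# Usage: block repeated trigrams while decoding token by token.
blocker = NGramBlocker(n=3)
generated = []
for tok in [5, 7, 9, 5, 7]:
    generated.append(tok)
    blocker.update(generated)
print(blocker.banned_next(generated))  # {9}: the trigram "5 7 9" already occurred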