@inproceedings{yan2021fastseq,
  author    = {Yan, Yu and Hu, Fei and Chen, Jiusheng and Bhendawade, Nikhil and Ye, Ting and Gong, Yeyun and Duan, Nan and Cui, Desheng and Chi, Bingyu and Zhang, Ruofei},
  title     = {FastSeq: Make Sequence Generation Faster},
  booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: System Demonstrations},
  year      = {2021},
  month     = {August},
  abstract  = {Transformer-based models have made a tremendous impact on natural language generation. However, inference speed is a bottleneck due to the large model size and the intensive computation involved in the auto-regressive decoding process. We develop the FastSeq framework to accelerate sequence generation without accuracy loss. The proposed optimization techniques include an attention cache optimization, an efficient algorithm for detecting repeated n-grams, and an asynchronous generation pipeline with parallel I/O. These optimizations are general enough to be applicable to Transformer-based models (e.g., T5, GPT2, and UniLM). Our benchmark results on a set of widely used and diverse models demonstrate a 4-9x gain in inference speed. Additionally, FastSeq is easy to use with a simple one-line code change. The source code is available at https://github.com/microsoft/fastseq.},
  url       = {https://www.microsoft.com/en-us/research/publication/fastseq-make-sequence-generation-faster/},
}
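
The abstract's "simple one-line code change" refers, per the linked repository's documented usage pattern, to importing fastseq before the underlying generation library so its optimizations patch in transparently. Below is a hedged sketch of that usage; the checkpoint name and generation arguments are illustrative, not taken from the paper.

# Sketch of the one-line FastSeq integration (assumes `pip install fastseq`
# and a compatible transformers version; model/arguments are illustrative).
import fastseq  # the one-line change: must come before the transformers import
from transformers import BartForConditionalGeneration, BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

inputs = tokenizer("FastSeq accelerates auto-regressive decoding.", return_tensors="pt")
summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=50)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))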
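On the "efficient algorithm for detecting repeated n-grams": a common way to make repeated n-gram blocking efficient is to maintain a hash map from each (n-1)-token prefix to the tokens that would complete an already seen n-gram, so each decoding step costs O(1) instead of rescanning the whole sequence. The sketch below illustrates that idea only; it is a CPU, single-sequence toy, not the paper's batched GPU implementation, and the class name is hypothetical.

# Minimal sketch of incremental repeated n-gram blocking (illustrative,
# not FastSeq's actual implementation).
from collections import defaultdict

class NGramBlocker:
    def __init__(self, n: int):
        self.n = n
        # Maps an (n-1)-token prefix to the set of next tokens that would
        # repeat an n-gram already present in the generated sequence.
        self.banned = defaultdict(set)

    def update(self, tokens: list[int]) -> None:
        """Record the newest n-gram after a token is appended."""
        if len(tokens) >= self.n:
            prefix = tuple(tokens[-self.n:-1])
            self.banned[prefix].add(tokens[-1])

    def banned_next(self, tokens: list[int]) -> set[int]:
        """Tokens that would repeat an already generated n-gram."""
        if len(tokens) < self.n - 1:
            return set()
        return self.banned[tuple(tokens[-(self.n - 1):])]

# Usage: block repeated trigrams while decoding token by token.
blocker = NGramBlocker(n=3)
generated = []
for tok in [5, 7, 9, 5, 7]:
    generated.append(tok)
    blocker.update(generated)
print(blocker.banned_next(generated))  # {9}: the trigram "5 7 9" already occurred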