% arXiv preprint 2112.05820, indexed under CoRR. The journal and volume fields
% are kept for styles without eprint support; eprint and archiveprefix carry the
% identifier for eprint-aware styles.
% NOTE(review): the url host looks like a mirror of the publisher page rather
% than the canonical address -- confirm and replace if so.
@article{kumatani2021building,
  author        = {Kumatani, Kenichi and Gmyr, Robert and Cruz Salinas, Felipe and Liu, Linquan and Zuo, Wei and Patel, Devang and Sun, Eric and Shi, Yu},
  title         = {Building a great multi-lingual teacher with sparsely-gated mixture of experts for speech recognition},
  journal       = {CoRR},
  volume        = {abs/2112.05820},
  year          = {2021},
  month         = dec,
  eprint        = {2112.05820},
  archiveprefix = {arXiv},
  url           = {http://approjects.co.za/?big=en-us/research/publication/deep-learning-mixture-of-expert-multi-lingual-speech-recognition/},
  abstract      = {The sparsely-gated Mixture of Experts (MoE) can magnify a network capacity with a little computational complexity. In this work, we investigate how multi-lingual Automatic Speech Recognition (ASR) networks can be scaled up with a simple routing algorithm in order to achieve better accuracy. More specifically, we apply the sparsely-gated MoE technique to two types of networks: Sequence-to-Sequence Transformer (S2S-T) and Transformer Transducer (T-T). We demonstrate through a set of ASR experiments on multiple language data that the MoE networks can reduce the relative word error rates by 16.3\% and 4.6\% with the S2S-T and T-T, respectively. Moreover, we thoroughly investigate the effect of the MoE on the T-T architecture in various conditions: streaming mode, non-streaming mode, the use of language ID and the label decoder with the MoE.},
}