% Technical report MSR-TR-2017-39: the 2017 version of Microsoft's
% conversational speech recognition system (Switchboard task).
% - institution is required for this entry type; the value is inferred from
%   the title and report number -- TODO confirm the exact issuing body.
% - month uses the predefined macro so styles can localise/abbreviate it.
% - The literal percent sign in the abstract is escaped so styles that print
%   the abstract do not treat the rest of the line as a TeX comment.
% - NOTE(review): the url host looks like a mirror/proxy rather than the
%   publisher's own site -- verify the link and replace it if a canonical
%   address is available.
@techreport{xiong2017the,
  author      = {Xiong, Wayne and Wu, Lingfeng and Alleva, Fil and Droppo, Jasha and Huang, Xuedong and Stolcke, Andreas},
  title       = {The {Microsoft} 2017 Conversational Speech Recognition System},
  institution = {Microsoft},
  number      = {MSR-TR-2017-39},
  year        = {2017},
  month       = aug,
  url         = {http://approjects.co.za/?big=en-us/research/publication/microsoft-2017-conversational-speech-recognition-system/},
  abstract    = {We describe the 2017 version of Microsoft's conversational speech recognition system, in which we update our 2016 system with recent developments in neural-network-based acoustic and language modeling to further advance the state of the art on the Switchboard speech recognition task.  The system adds a CNN-BLSTM acoustic model to the set of model architectures we combined previously, and includes character-based and dialog session aware LSTM language models in rescoring.  For system combination we adopt a two-stage approach, whereby subsets of acoustic models are first combined at the senone/frame level, followed by a word-level voting via confusion networks.  We also added a confusion network rescoring step after system combination. The resulting system yields a 5.1\% word error rate on the 2000 Switchboard evaluation set.},
}