@inproceedings{seide2011conversational,
  author    = {Seide, Frank and Li, Gang and Yu, Dong},
  title     = {Conversational Speech Transcription Using Context-Dependent Deep Neural Networks},
  booktitle = {Interspeech 2011},
  year      = {2011},
  month     = {August},
  abstract  = {We apply the recently proposed Context-Dependent Deep-Neural-Network HMMs, or CD-DNN-HMMs, to speech-to-text transcription. For single-pass speaker-independent recognition on the RT03S Fisher portion of the phone-call transcription benchmark (Switchboard), the word-error rate is reduced from 27.4%, obtained by discriminatively trained Gaussian-mixture HMMs, to 18.5%, a 33% relative improvement. CD-DNN-HMMs combine classic artificial-neural-network HMMs with traditional tied-state triphones and deep-belief-network pre-training. They had previously been shown to reduce errors by 16% relatively when trained on tens of hours of data using hundreds of tied states. This paper takes CD-DNN-HMMs further and applies them to transcription using over 300 hours of training data, over 9000 tied states, and up to 9 hidden layers, and demonstrates how sparseness can be exploited. On four less well-matched transcription tasks, we observe relative error reductions of 22--28%.},
  publisher = {International Speech Communication Association},
  url       = {https://www.microsoft.com/en-us/research/publication/conversational-speech-transcription-using-context-dependent-deep-neural-networks/},
}