% yu2008robust -- journal article on a cepstral-domain MMSE noise suppressor
% for robust speech recognition (see the abstract field for details).
% NOTE(review): this record has no `pages` and no `doi` field; add both from
%   the publisher's record so the citation is complete under standard styles.
% NOTE(review): the `url` host is kept exactly as found; it does not look like
%   a publisher or institutional domain -- verify it, and prefer a DOI.
@article{yu2008robust,
  author    = {Yu, Dong and Deng, Li and Droppo, Jasha and Wu, Jian and Gong, Yifan and Acero, Alex and Tashev, Ivan and Seltzer, Mike},
  title     = {Robust speech recognition using cepstral minimum-mean-square-error noise suppressor},
  journal   = {{IEEE} Transactions on Audio, Speech, and Language Processing},
  volume    = {16},
  number    = {5},
  year      = {2008},
  month     = jul,
  publisher = {Institute of Electrical and Electronics Engineers, Inc.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/robust-speech-recognition-using-cepstral-minimum-mean-square-error-noise-suppressor/},
  abstract  = {We present an efficient and effective nonlinear feature-domain noise suppression algorithm, motivated by the minimum mean-square-error (MMSE) optimization criterion, for noise-robust speech recognition. Distinguishing from the log-MMSE spectral amplitude noise suppressor proposed by Ephraim and Malah (E\&M), our new algorithm is aimed to minimize the error expressed explicitly for the Mel-frequency cepstra instead of discrete Fourier transform (DFT) spectra, and it operates on the Mel-frequency filter bank's output. As a consequence, the statistics used to estimate the suppression factor become vastly different from those used in the E\&M log-MMSE suppressor. Our algorithm is significantly more efficient than the E\&M's log-MMSE suppressor since the number of the channels in the Mel-frequency filter bank is much smaller (23 in our case) than the number of bins (256) in DFT. We have conducted extensive speech recognition experiments on the standard Aurora-3 task. The experimental results demonstrate a reduction of the recognition word error rate by 48\% over the standard ICSLP02 baseline, 26\% over the cepstral mean normalization baseline, and 13\% over the popular E\&M's log-MMSE noise suppressor. The experiments also show that our new algorithm performs slightly better than the ETSI advanced front end (AFE) on the well-matched and mid-mismatched settings, and has 8\% and 10\% fewer errors than our earlier SPLICE (stereo-based piecewise linear compensation for environments) system on these settings, respectively.},
}