@inproceedings{zhang2021denoising,
  author    = {Zhang, Chen and Ren, Yi and Tan, Xu and Liu, Jinglin and Zhang, Kejun and Qin, Tao and Zhao, Sheng and Liu, Tie-Yan},
  title     = {Denoising Text to Speech with Frame-Level Noise Modeling},
  booktitle = {ICASSP 2021},
  year      = {2021},
  month     = {January},
  abstract  = {While neural-based text to speech (TTS) models can synthesize natural and intelligible voice, they usually require high-quality speech data, which is costly to collect. In many scenarios, only noisy speech of a target speaker is available, which poses challenges for training a TTS model for that speaker. Previous works usually address the challenge in one of two ways: 1) training the TTS model on speech denoised with an enhancement model; 2) taking a single noise embedding as input when training on noisy speech. However, these methods usually cannot handle speech with complicated real-world noise, such as noise that varies strongly over time. In this paper, we develop DenoiSpeech, a TTS system that can synthesize clean speech for a speaker with only noisy speech data. In DenoiSpeech, we handle real-world noisy speech by modeling the fine-grained frame-level noise with a noise condition module that is jointly trained with the TTS model. Experimental results on real-world data show that DenoiSpeech outperforms the previous two methods by 0.31 and 0.66 MOS, respectively.},
  url       = {https://www.microsoft.com/en-us/research/publication/denoising-text-to-speech-with-frame-level-noise-modeling/},
}
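The abstract describes conditioning the TTS model on a frame-level noise representation that is learned jointly with the acoustic model. The entry contains no code, so the following is only a minimal PyTorch-style sketch of that general idea; the module name FrameLevelNoiseCondition, the convolutional noise encoder, the dimensions, and the zero-condition choice at inference are illustrative assumptions, not the authors' implementation.

# Minimal sketch (not the DenoiSpeech code): frame-level noise conditioning for a
# TTS acoustic model. Assumes a FastSpeech-like encoder whose text encoding has
# already been expanded to frame resolution; all names and sizes are illustrative.
import torch
import torch.nn as nn

class FrameLevelNoiseCondition(nn.Module):
    """Predicts one noise embedding per frame from the noisy mel-spectrogram and
    adds it to the frame-level text encoding, so the decoder can attribute the
    noise to the condition rather than to the speaker's voice."""
    def __init__(self, n_mels: int = 80, d_model: int = 256):
        super().__init__()
        # Small convolutional stack over mel frames -> one embedding per frame.
        self.noise_encoder = nn.Sequential(
            nn.Conv1d(n_mels, d_model, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(d_model, d_model, kernel_size=3, padding=1),
        )

    def forward(self, frame_hidden: torch.Tensor, noisy_mel: torch.Tensor) -> torch.Tensor:
        # frame_hidden: (batch, frames, d_model) frame-level text encoding
        # noisy_mel:    (batch, frames, n_mels) noisy target spectrogram (training only)
        noise_emb = self.noise_encoder(noisy_mel.transpose(1, 2)).transpose(1, 2)
        return frame_hidden + noise_emb  # condition each frame on its own noise estimate

if __name__ == "__main__":
    module = FrameLevelNoiseCondition()
    hidden = torch.randn(2, 120, 256)
    mel = torch.randn(2, 120, 80)
    out_train = module(hidden, mel)   # training: condition on estimated frame-level noise
    # At inference no noisy target exists; one plausible choice (assumed here) is to
    # use a zero noise condition so the model synthesizes clean speech.
    out_infer = hidden + torch.zeros_like(hidden)
    print(out_train.shape, out_infer.shape)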