% Conference paper (IEEE SLT 2014) on ANN bottleneck features for speaker diarization.
% NOTE(review): the url host looks like a mirror rather than the publisher's own site -- confirm, and add a doi field if one is known.
@inproceedings{yella2014artificial,
  author    = {Yella, Sree Harsha and Stolcke, Andreas and Slaney, Malcolm},
  title     = {Artificial Neural Network Features for Speaker Diarization},
  booktitle = {Proc.\ {IEEE} Spoken Language Technology Workshop},
  pages     = {402--406},
  publisher = {IEEE},
  year      = {2014},
  month     = dec,
  url       = {http://approjects.co.za/?big=en-us/research/publication/artificial-neural-network-features-for-speaker-diarization/},
  abstract  = {Speaker diarization finds contiguous speaker segments in an audio recording and clusters them by speaker identity, without any a-priori knowledge. Diarization is typically based on short-term spectral features such as Mel-frequency cepstral coefficients (MFCCs). Though these features carry average information about the vocal tract characteristics of a speaker, they are also susceptible to factors unrelated to the speaker identity. In this study, we propose an artificial neural network (ANN) architecture to learn a feature transform that is optimized for speaker diarization. We train a multi-hidden-layer ANN to judge whether two given speech segments came from the same or different speakers, using a shared transform of the input features that feeds into a bottleneck layer. We then use the bottleneck layer activations as features, either alone or in combination with baseline MFCC features in a multistream mode, for speaker diarization on test data. The resulting system is evaluated on various corpora of multi-party meetings. A combination of MFCC and ANN features gives up to 14\% relative reduction in diarization error, demonstrating that these features are providing an additional independent source of knowledge.},
}