% Yella, Stolcke, Slaney (2014): workshop paper on ANN bottleneck features for
% speaker diarization, as described by the title/booktitle/abstract below.
% NOTE(review): the url host does not look like a publisher or author site
% (possibly a proxy/mirror rewrite) -- verify and replace with the canonical
% URL or a doi field once confirmed.
@inproceedings{yella2014artificial,
  author    = {Yella, Sree Harsha and Stolcke, Andreas and Slaney, Malcolm},
  title     = {Artificial Neural Network Features for Speaker Diarization},
  booktitle = {Proc. IEEE Spoken Language Technology Workshop},
  pages     = {402--406},
  publisher = {IEEE - Institute of Electrical and Electronics Engineers},
  year      = {2014},
  month     = dec,
  url       = {http://approjects.co.za/?big=en-us/research/publication/artificial-neural-network-features-for-speaker-diarization/},
  abstract  = {Speaker diarization finds contiguous speaker segments in an audio recording and clusters them by speaker identity, without any a-priori knowledge. Diarization is typically based on short-term spectral features such as Mel-frequency cepstral coefficients (MFCCs). Though these features carry average information about the vocal tract characteristics of a speaker, they are also susceptible to factors unrelated to the speaker identity. In this study, we propose an artificial neural network (ANN) architecture to learn a feature transform that is optimized for speaker diarization. We train a multi-hidden-layer ANN to judge whether two given speech segments came from the same or different speakers, using a shared transform of the input features that feeds into a bottleneck layer. We then use the bottleneck layer activations as features, either alone or in combination with baseline MFCC features in a multistream mode, for speaker diarization on test data. The resulting system is evaluated on various corpora of multi-party meetings. A combination of MFCC and ANN features gives up to 14\% relative reduction in diarization error, demonstrating that these features are providing an additional independent source of knowledge.},
}