@inproceedings{jojic2004audio-visual,
  author    = {Jojic, Nebojsa},
  title     = {Audio-Visual Graphical Models for Speech Processing},
  booktitle = {Proc. of the Int. Conf. on Acoustics, Speech, and Signal Processing},
  year      = {2004},
  month     = {May},
  abstract  = {Perceiving sounds in a noisy environment is a challenging problem. Visual lip-reading can provide relevant information but is also challenging because the lips are moving and a tracker must deal with a variety of conditions. Typically, audio-visual systems have been assembled from individually engineered modules. We propose to fuse audio and video in a probabilistic generative model that implements cross-modal self-supervised learning, enabling adaptation to audio-visual data. The video model features a Gaussian mixture model embedded in a linear subspace of a sprite which translates in the video. The system can learn to detect and enhance speech in noise given only a short (30-second) sequence of audio-visual data. We show some results for speech detection and enhancement, and discuss extensions to the model that are under investigation.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/audio-visual-graphical-models-for-speech-processing/},
}