% Conference paper entry for Cutler et al., 2020 (multimodal active speaker
% detection and virtual cinematography).
% - month uses the predefined three-letter macro so styles can format it.
% - pages uses a double hyphen for the range.
% - abstract is kept ASCII-only (en dashes written as --).
% NOTE(review): the url host (approjects.co.za) does not look like the
% publisher's own site -- confirm the canonical URL, and add a doi field
% if one is available.
@inproceedings{cutler2020multimodal,
  author    = {Cutler, Ross and Mehran, Ramin and Johnson, Sam and Zhang, Cha and Kirk, Adam and Whyte, Oliver and Kowdle, Adarsh},
  title     = {Multimodal Active Speaker Detection and Virtual Cinematography for Video Conferencing},
  booktitle = {2020 International Conference on Acoustics, Speech, and Signal Processing},
  publisher = {IEEE},
  year      = {2020},
  month     = may,
  pages     = {4527--4531},
  url       = {http://approjects.co.za/?big=en-us/research/publication/multimodal-active-speaker-detection-and-virtual-cinematography-for-video-conferencing/},
  abstract  = {Active speaker detection (ASD) and virtual cinematography (VC) can significantly improve the experience of a video conference by automatically panning, tilting and zooming of a camera: subjectively users rate an expert video cinematographer significantly higher than the unedited video. We describe a new automated ASD and VC that performs within 0.3 MOS of an expert cinematographer based on subjective ratings with a 1--5 scale. This system uses a 4K wide-FOV camera, a depth camera, and a microphone array, extracts features from each modality and trains an ASD using an AdaBoost machine learning system that is very efficient and runs in real-time. A VC is similarly trained using machine learning. To avoid distracting the room participants the system has no moving parts -- the VC works by cropping and zooming the 4K wide-FOV video stream. The system was tuned and evaluated using extensive crowdsourcing techniques and evaluated on a system with N=100 meetings, each 25 minutes in length.},
}