@inproceedings{chen2024knowledge,
  author       = {Chen, Tuochao and Itani, Malek and Srinivas, Vidya and Eskimez, Sefik Emre and Yoshioka, Takuya and Gollakota, Shyamnath},
  title        = {Knowledge boosting during low-latency inference},
  booktitle    = {Interspeech 2024},
  organization = {ISCA},
  year         = {2024},
  month        = {September},
  abstract     = {Models for low-latency, streaming applications could benefit from the knowledge capacity of larger models, but edge devices cannot run these models due to resource constraints. A possible solution is to transfer hints during inference from a large model running remotely to a small model running on-device. However, this incurs a communication delay that breaks real-time requirements and does not guarantee that both models will operate on the same data at the same time. We propose knowledge boosting, a novel technique that allows a large model to operate on time-delayed input during inference while still boosting small model performance. Using a streaming neural network that processes 8~ms chunks, we evaluate different speech separation and enhancement tasks with communication delays of up to six chunks or 48~ms. Our results show larger gains where the performance gap between the small and large models is wide, demonstrating a promising method for large-small model collaboration in low-latency applications. Code, dataset, and audio samples are available at https://knowledgeboosting.cs.washington.edu/.},
  url          = {https://www.microsoft.com/en-us/research/publication/knowledge-boosting-during-low-latency-inference/},
}
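
For context on the technique the abstract summarizes, here is a minimal sketch of a delayed-hint inference loop. It assumes a fixed communication delay measured in whole chunks; the SmallModel, LargeModel, and stream names are hypothetical, and the placeholder arithmetic stands in for the actual separation network and hint fusion. This is an illustration of the idea, not the paper's implementation (see https://knowledgeboosting.cs.washington.edu/ for the authors' code).

# Sketch of the knowledge-boosting inference loop described in the abstract.
# Assumptions: a fixed delay of D chunks lumps the round-trip link latency,
# and toy arithmetic replaces the real small/large networks.
from collections import deque
import numpy as np

CHUNK_MS = 8          # streaming chunk size from the abstract
DELAY_CHUNKS = 6      # up to six chunks (48 ms) of communication delay

class SmallModel:
    """On-device streaming model; fuses a (possibly stale) hint per chunk."""
    def process(self, chunk: np.ndarray, hint: np.ndarray | None) -> np.ndarray:
        out = chunk * 0.5                      # placeholder separation step
        if hint is not None:
            out = out + 0.1 * hint             # placeholder hint fusion
        return out

class LargeModel:
    """Remote model; operates on time-delayed input and emits hints."""
    def hint(self, chunk: np.ndarray) -> np.ndarray:
        return np.tanh(chunk)                  # placeholder hint computation

def stream(chunks, small, large, delay=DELAY_CHUNKS):
    pending = deque()                          # hints in flight over the link
    for t, chunk in enumerate(chunks):
        pending.append(large.hint(chunk))      # hint for chunk t enters the link
        # The hint for chunk t - delay is the newest one that has arrived,
        # so the small model never blocks waiting on the network.
        hint = pending.popleft() if t >= delay else None
        yield small.process(chunk, hint)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    chunks = [rng.standard_normal(128) for _ in range(10)]  # toy audio chunks
    for out in stream(chunks, SmallModel(), LargeModel()):
        pass  # one enhanced 8 ms chunk per iteration

The design point mirrored here is that the on-device model emits output for chunk t immediately, fusing whatever hint has already arrived (derived from chunk t - delay), which is how the large model can contribute despite the communication delay.
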