@inproceedings{li2016action,
  author    = {Li, Qing and Qiu, Zhaofan and Yao, Ting and Mei, Tao and Rui, Yong and Luo, Jiebo},
  title     = {Action Recognition by Learning Deep Multi-Granular Spatio-Temporal Video Representation},
  booktitle = {ACM International Conference on Multimedia Retrieval (ICMR)},
  year      = {2016},
  month     = {June},
  abstract  = {Recognizing actions in videos is a challenging task because video is an information-intensive medium with complex variations. Most existing methods treat video as a flat data sequence, ignoring the intrinsic hierarchical structure of the video content. In particular, an action may span different granularities in this hierarchy, including, from small to large, a single \emph{frame}, consecutive frames (\emph{motion}), a short \emph{clip}, and the entire \emph{video}. In this paper, we present a novel framework that boosts action recognition by learning a deep spatio-temporal video representation at hierarchical multi-granularity. Specifically, we model each granularity as a single stream with 2D (for the \emph{frame} and \emph{motion} streams) or 3D (for the \emph{clip} and \emph{video} streams) convolutional neural networks (CNNs). The framework therefore consists of multi-stream 2D and 3D CNNs that learn both spatial and temporal representations. Furthermore, we employ Long Short-Term Memory (LSTM) networks on the \emph{frame}, \emph{motion}, and \emph{clip} streams to exploit long-term temporal dynamics. With a \emph{softmax} layer on top of each stream, classification scores are predicted from all streams and then combined by a novel fusion scheme based on the multi-granular score distribution. Our networks are learned in an end-to-end fashion. On the UCF101 and HMDB51 video action benchmarks, our framework achieves promising performance compared with the state of the art.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/action-recognition-learning-deep-multi-granular-spatio-temporal-video-representation/},
  note      = {Best Paper Finalist (the first two authors made equal contributions)},
}