% HRFormer paper (Yuan et al.), conference entry for NeurIPS 2021.
% Review notes:
%   - title: the last word had been cut off as "Predict" (the url slug carries the
%     same truncation); restored to "Prediction", matching the "dense prediction"
%     wording used in the abstract.
%   - abstract: removed the trailing sentence "Code is available at: this https URL."
%     because the link target was lost in export and only a placeholder remained.
%   - abstract: percent signs are escaped so a style that prints the abstract does
%     not treat the rest of the line as a LaTeX comment.
%   - NOTE(review): the url host looks like a third-party mirror rather than the
%     publisher's own site -- confirm and replace with the canonical page or a DOI.
@inproceedings{yuan2021hrformer,
  author    = {Yuan, Yuhui and Fu, Rao and Huang, Lang and Lin, Weihong and Zhang, Chao and Chen, Xilin and Wang, Jingdong},
  title     = {{HRFormer}: High-Resolution Vision Transformer for Dense Prediction},
  booktitle = {Advances in Neural Information Processing Systems ({NeurIPS})},
  year      = {2021},
  month     = dec,
  abstract  = {We present a High-Resolution Transformer (HRFormer) that learns high-resolution representations for dense prediction tasks, in contrast to the original Vision Transformer that produces low-resolution representations and has high memory and computational cost. We take advantage of the multi-resolution parallel design introduced in high-resolution convolutional networks (HRNet), along with local-window self-attention that performs self-attention over small non-overlapping image windows, for improving the memory and computation efficiency. In addition, we introduce a convolution into the FFN to exchange information across the disconnected image windows. We demonstrate the effectiveness of the High-Resolution Transformer on both human pose estimation and semantic segmentation tasks, e.g., HRFormer outperforms Swin transformer by 1.3 AP on COCO pose estimation with 50\% fewer parameters and 30\% fewer FLOPs.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/hrformer-high-resolution-vision-transformer-for-dense-predict/},
}