@inproceedings{lin2023efficient,
  author    = {Lin, Bin and Zheng, Ningxin and Wang, Lei and Cao, Shijie and Ma, Lingxiao and Zhang, Quanlu and Zhu, Yi and Cao, Ting and Xue, Jilong and Yang, Yuqing and Yang, Fan},
  title     = {Efficient {GPU} Kernels for {N:M}-Sparse Weights in Deep Learning},
  booktitle = {Proceedings of the Sixth Conference on Machine Learning and Systems (MLSys'23)},
  year      = {2023},
  month     = jun,
  abstract  = {N:M sparsity is becoming increasingly popular for its potential to deliver high model accuracy and computational efficiency in deep learning. However, the real-world benefit of N:M sparsity remains limited, as there is a lack of dedicated GPU kernel implementations for general N:M sparsity with various sparsity ratios. In this work, we introduce nmSPARSE, a library of efficient GPU kernels for two fundamental operations in neural networks with N:M sparse weights: sparse matrix-vector multiplication (SpMV) and sparse matrix-matrix multiplication (SpMM). By exploiting the intrinsic balance characteristic of N:M sparsity, nmSPARSE kernels rearrange the irregular computation and scattered memory accesses of sparse matrix multiplication into hardware-aligned regular computation and conflict-free memory accesses at runtime. When evaluated on an NVIDIA A100 GPU, nmSPARSE kernels achieve up to 5.2× speedup on SpMV and 6.0× speedup on SpMM over the fastest baseline. End-to-end studies on transformer models demonstrate that nmSPARSE outperforms the other baselines.},
  url       = {https://www.microsoft.com/en-us/research/publication/efficient-gpu-kernels-for-nm-sparse-weights-in-deep-learning/},
}
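
% The abstract above refers to the N:M sparsity pattern without defining it. As a
% minimal sketch of that pattern only (not of the nmSPARSE kernels themselves),
% the following NumPy snippet prunes a dense weight matrix so that each group of
% M consecutive weights along a row keeps its N largest-magnitude entries. The
% helper name prune_n_m and the 2:4 example are illustrative assumptions, not
% code from the paper.

import numpy as np

def prune_n_m(w: np.ndarray, n: int, m: int) -> np.ndarray:
    """Zero out all but the n largest-magnitude entries in each group of
    m consecutive entries along the last axis of w (N:M weight sparsity)."""
    rows, cols = w.shape
    assert cols % m == 0, "column count must be divisible by m"
    groups = w.reshape(rows, cols // m, m)        # split each row into m-wide groups
    order = np.argsort(np.abs(groups), axis=-1)   # ascending by magnitude per group
    mask = np.zeros_like(groups, dtype=bool)
    np.put_along_axis(mask, order[..., -n:], True, axis=-1)  # keep the n largest
    return (groups * mask).reshape(rows, cols)

w = np.random.randn(4, 8)
w_24 = prune_n_m(w, n=2, m=4)  # 2:4 sparsity: at most 2 nonzeros per group of 4
assert (np.count_nonzero(w_24.reshape(4, 2, 4), axis=-1) <= 2).all()

% Because every group carries the same number of nonzeros, work per group is
% uniform; this is the "intrinsic balance characteristic" the abstract says the
% kernels exploit to regularize computation and memory accesses on the GPU.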