@inproceedings{hao2025scaling,
  author    = {Hao, Zixu and Wei, Jianyu and Wang, Tuowei and Huang, Minxing and Jiang, Huiqiang and Jiang, Shiqi and Cao, Ting and Ren, Ju},
  title     = {Scaling {LLM} Test-Time Compute with Mobile {NPU} on Smartphones},
  booktitle = {EuroSys 2026},
  publisher = {ACM},
  year      = {2025},
  month     = nov,
  abstract  = {Deploying Large Language Models (LLMs) on mobile devices faces the challenge of insufficient performance in smaller models and excessive resource consumption in larger ones. This paper highlights that mobile Neural Processing Units (NPUs) have underutilized computational resources, particularly their matrix multiplication units, during typical LLM inference. To leverage this idle compute capacity, we proposes applying test-time scaling techniques on mobile NPUs to enhance the performance of smaller LLMs. However, this approach confronts inherent NPU challenges, such as inadequate hardware support for fine-grained quantization and low efficiency in general-purpose computations. We address these by designing and implementing an end-to-end LLM inference system for Qualcomm Hexagon NPUs. This system incorporates hardware-aware, fine-grained tile group quantization, weight rearrangement and quantization group coalescing, as well as LUT-based transformations to accelerate Softmax and dequantization processes. Experiments demonstrate that by utilizing the NPU's idle compute power, our system enables smaller models with time-time scaling to achieve a better accuracy-latency trade-off than larger models without test-time scaling, paving new avenues for deploying high-performance small LLMs on mobile devices. To our knowledge, this is the first work to explore LLM test-time scaling workloads on mobile devices.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/scaling-llm-test-time-compute-with-mobile-npu-on-smartphones/},
}