@inproceedings{jain2025performance,
  author    = {Jain, Kunal and Parayil, A. and Mallick, Ankur and Choukse, Esha and Qin, Xiaoting and Zhang, Jue and Goiri, Íñigo and Wang, Rujia and Bansal, Chetan and Ruehle, Victor and Kulkarni, Anoop and Kofsky, Steve and Rajmohan, Saravan},
  title     = {Performance Aware LLM Load Balancer for Mixed Workloads},
  booktitle = {EuroMLSys 2025},
  year      = {2025},
  month     = {April},
  abstract  = {Large Language Model (LLM) workloads consist of distinct prefill and decode phases, each with unique compute and memory requirements that should be considered when routing input queries across cluster instances. However, existing load-balancing algorithms treat these workloads as monolithic jobs, ignoring the differences between the two phases. This oversight leads to suboptimal query distribution and increased response latency. In our work, we first characterize the factors affecting response latency during LLM inference. We show that balancing inference requests across available LLM instances can improve end-to-end latency more than simply optimizing the instance-level scheduler. Motivated by these findings, we propose a heuristic-guided, reinforcement learning-based router for data-driven, workload-aware scheduling. Our router distributes queries across LLM instances by using a trainable response-length predictor and a novel formulation for estimating the impact of mixing different workloads, achieving over 11% lower end-to-end latency than existing methods on mixed public datasets. Our work represents a first step toward a holistic optimization framework and serves as a benchmark for deriving optimal load-balancing strategies tailored to different reward functions and requirements. Beyond latency, the proposed framework can be extended to optimize for various performance criteria, ensuring that the system meets diverse operational objectives.},
  url       = {https://www.microsoft.com/en-us/research/publication/performance-aware-llm-load-balancer-for-mixed-workloads/},
}