@comment{
  Entry roy2021sparkcruise: conference paper on SparkCruise, listed under VLDB 2021.
  Words in the title that must keep their capitalisation are brace-protected, and
  the month uses the standard three-letter macro so styles can localise it.
  NOTE(review): the url host approjects.co.za with a ?big= query looks like a
  mirror or proxy address rather than the publisher page. Confirm the canonical
  link, and prefer a doi field if one exists.
}
@inproceedings{roy2021sparkcruise,
  author    = {Roy, Abhishek and Jindal, Alekh and Gomatam, Priyanka and Ouyang, Xiating and Gosalia, Ashit and Ravi, Nishkam and Mann, Swinky and Jain, Prakhar},
  title     = {{SparkCruise}: Workload Optimization in Managed {Spark} Clusters at {Microsoft}},
  booktitle = {{VLDB} 2021},
  year      = {2021},
  month     = aug,
  abstract  = {Today cloud companies offer fully managed Spark services. This has made it easy to onboard new customers but has also increased the volume of users and their workload sizes. However, both cloud providers and users lack the tools and time to optimize these massive workloads. To solve this problem, we designed SparkCruise that can help understand and optimize workload instances by adding a workload-driven feedback loop to the Spark query optimizer. In this paper, we present our approach to collecting and representing Spark query workloads and use it to improve the overall performance on the workload, all without requiring any access to user data. These methods scale with the number of workloads and apply learned feedback in an online fashion. We explain one specific workload optimization developed for computation reuse. We also share the detailed analysis of production Spark workloads and contrast them with the corresponding analysis of TPC-DS benchmark. To the best of our knowledge, this is the first study to share the analysis of large-scale production Spark SQL workloads.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/sparkcruise-workload-optimization-in-managed-spark-clusters-at-microsoft/},
}