% Pipemizer (PVLDB, September 2022). LaTeX specials in the abstract are escaped
% so the entry is safe under styles that print the abstract field.
@inproceedings{gakhar2022pipemizer,
  author    = {Gakhar, Sunny and Cahoon, Joyce and Le, Wangchao and Li, Xiangnan and Ravichandran, Kaushik and Patel, Hiren and Friedman, Marc and Haynes, Brandon and Qiao, Shi and Jindal, Alekh and Leeka, Jyoti},
  title     = {{Pipemizer}: An Optimizer for Analytics Data Pipelines},
  booktitle = {PVLDB},
  year      = {2022},
  month     = sep,
  abstract  = {Pipemizer is an optimizer and recommender aimed at improving the
               performance of queries or jobs in pipelines. These job pipelines
               are ubiquitous in modern data analytics due to jobs reading
               output files written by other jobs. Given that more than 650k
               jobs run on Microsoft's SCOPE job service per day and about
               70\% have inter-job dependencies, identifying optimization
               opportunities across query jobs is of considerable interest to
               both cluster operators and users. Pipemizer addresses this need
               by providing recommendations to users, allowing users to
               understand their system, and facilitating automated application
               of recommendations. Pipemizer introduces novel optimizations
               that include holistic pipeline-aware statistics generation,
               inter-job operator push-up, and job split \& merge.},
  url       = {https://www.microsoft.com/en-us/research/publication/pipemizer-an-optimizer-for-analytics-data-pipelines/},
}