% Conference paper: "The Power of Choice in Data-Aware Cluster Scheduling" (KMN), 2014.
% Cleanup notes:
%   - month uses the predefined macro (oct) so the style controls its rendering.
%   - percent signs in the abstract are escaped so styles that print it do not
%     have the remainder of the line swallowed as a TeX comment.
%   - the former edition field only repeated the venue name and was dropped;
%     the venue is carried by booktitle.
% NOTE(review): the url host does not look like the publisher's own site
%   (possibly a mirror) -- confirm, and prefer a canonical link or DOI if one exists.
@inproceedings{venkataraman2014the,
  author    = {Venkataraman, Shivaram and Panda, Aurojit and Ananthanarayanan, Ganesh and Franklin, Michael J. and Stoica, Ion},
  title     = {The Power of Choice in Data-Aware Cluster Scheduling},
  booktitle = {{USENIX} {OSDI}},
  year      = {2014},
  month     = oct,
  isbn      = {978-1-931971-16-4},
  url       = {http://approjects.co.za/?big=en-us/research/publication/power-choice-data-aware-cluster-scheduling/},
  abstract  = {Providing timely results in the face of rapid growth in data volumes has become important for analytical frameworks. For this reason, frameworks increasingly operate on only a subset of the input data. A key property of such sampling is that combinatorially many subsets of the input are present. We present KMN, a system that leverages these choices to perform data-aware scheduling, i.e., minimize time taken by tasks to read their inputs, for a DAG of tasks. KMN not only uses choices to co-locate tasks with their data but also percolates such combinatorial choices to downstream tasks in the DAG by launching a few additional tasks at every upstream stage. Evaluations using workloads from Facebook and Conviva on a 100-machine EC2 cluster show that KMN reduces average job duration by 81\% using just 5\% additional resources.},
}