% INSTalytics journal article (ACM Transactions on Storage, vol. 15, no. 4);
% the note field records it as an invited paper of the USENIX FAST 2019 special section.
@article{sivathanu2020instalytics,
  author   = {Sivathanu, Muthian and Vuppalapati, Midhul and Gulavani, Bhargav and Rajan, Kaushik and Leeka, Jyoti and Mohan, Jayashree and Kedia, Piyus},
  title    = {{INSTalytics}: Cluster Filesystem Co-design for Big-data Analytics},
  journal  = {ACM Transactions on Storage},
  volume   = {15},
  number   = {4},
  pages    = {1--30},
  year     = {2020},
  month    = feb,
  note     = {Invited Paper: USENIX FAST 2019 Special Section},
  url      = {http://approjects.co.za/?big=en-us/research/publication/instalytics-cluster-filesystem-co-design-for-big-data-analytics/},
  abstract = {We present the design, implementation, and evaluation of INSTalytics, a co-designed stack of a cluster file system and the compute layer, for efficient big-data analytics in large-scale data centers. INSTalytics amplifies the well-known benefits of data partitioning in analytics systems; instead of traditional partitioning on one dimension, INSTalytics enables data to be simultaneously partitioned on four different dimensions at the same storage cost, enabling a larger fraction of queries to benefit from partition filtering and joins without network shuffle. To achieve this, INSTalytics uses compute-awareness to customize the three-way replication that the cluster file system employs for availability. A new heterogeneous replication layout enables INSTalytics to preserve the same recovery cost and availability as traditional replication. INSTalytics also uses compute-awareness to expose a new sliced-read API that improves performance of joins by enabling multiple compute nodes to read slices of a data block efficiently via co-ordinated request scheduling and selective caching at the storage nodes.
We have built a prototype implementation of INSTalytics in a production analytics stack, and we show that recovery performance and availability is similar to physical replication, while providing significant improvements in query performance, suggesting a new approach to designing cloud-scale big-data analytics systems.},
}