% Conference paper (NSDI '15) introducing Wide-Area Big Data (WABD) and the Geode prototype.
% NOTE(review): the url host below looks like a mirror/proxy rather than the
% publisher's own site -- confirm and replace with the canonical link (or a DOI).
@inproceedings{vulimiri2015global,
  author    = {Vulimiri, Ashish and Curino, Carlo and Godfrey, P. Brighten and Jungblut, Thomas and Padhye, Jitu and Varghese, George},
  title     = {Global Analytics in the Face of Bandwidth and Regulatory Constraints},
  booktitle = {12th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} '15)},
  year      = {2015},
  month     = may,
  isbn      = {978-1-931971-21-8},
  url       = {http://approjects.co.za/?big=en-us/research/publication/global-analytics-face-bandwidth-regulatory-constraints/},
  abstract  = {Global-scale organizations produce large volumes of data across geographically distributed data centers. Querying and analyzing such data as a whole introduces new research issues at the intersection of networks and databases. Today systems that compute SQL analytics over geographically distributed data operate by pulling all data to a central location. This is problematic at large data scales due to expensive transoceanic links, and may be rendered impossible by emerging regulatory constraints. The new problem of Wide-Area Big Data (WABD) consists in orchestrating query execution across data centers to minimize bandwidth while respecting regulatory constraints. WABD combines classical query planning with novel network-centric mechanisms designed for a wide-area setting such as pseudo-distributed execution, joint query optimization, and deltas on cached subquery results. Our prototype, Geode, builds upon Hive and uses 250$\times$ less bandwidth than centralized analytics in a Microsoft production workload and up to 360$\times$ less on popular analytics benchmarks including TPC-CH and Berkeley Big Data. Geode supports all SQL operators, including Joins, across global data.},
}