% Conference paper describing Polybase (SIGMOD 2013, pp. 1255--1266).
% Title is stored in Title Case with the product name brace-protected so that
% sentence-casing styles keep its capital letter.
% NOTE(review): the url host was restored from a proxy-mangled form
% (approjects.co.za/?big=...) to the publisher site -- confirm it resolves,
% and add a doi field once the DOI has been verified.
@inproceedings{dewitt2013split,
  author    = {DeWitt, David J. and Nehme, Rimma and Shankar, Srinath and Aguilar-Saborit, Josep and Avanes, Artin and Flasza, Miro and Gramling, Jim and Halverson, Alan},
  title     = {Split Query Processing in {Polybase}},
  booktitle = {2013 {ACM} {SIGMOD} International Conference on Management of Data},
  pages     = {1255--1266},
  publisher = {ACM},
  year      = {2013},
  month     = jun,
  url       = {https://www.microsoft.com/en-us/research/publication/split-query-processing-in-polybase/},
  abstract  = {This paper presents Polybase, a feature of SQL Server PDW V2 that allows users to manage and query data stored in a Hadoop cluster using the standard SQL query language. Unlike other database systems that provide only a relational view over HDFS-resident data through the use of an external table mechanism, Polybase employs a split query processing paradigm in which SQL operators on HDFS-resident data are translated into MapReduce jobs by the PDW query optimizer and then executed on the Hadoop cluster. The paper describes the design and implementation of Polybase along with a thorough performance evaluation that explores the benefits of employing a split query processing paradigm for executing queries that involve both structured data in a relational DBMS and unstructured data in Hadoop. Our results demonstrate that while the use of a split-based query execution paradigm can improve the performance of some queries by as much as 10X, one must employ a cost-based query optimizer that considers a broad set of factors when deciding whether or not it is advantageous to push a SQL operator to Hadoop. These factors include the selectivity factor of the predicate, the relative sizes of the two clusters, and whether or not their nodes are co-located. In addition, differences in the semantics of the Java and SQL languages must be carefully considered in order to avoid altering the expected results of a query.},
}