% Microsoft Research technical report MSR-TR-2009-180 (December 2009).
% The issuing organisation is stored in `institution`, the field that
% techreport entries require; `month` uses the predefined macro so the
% bibliography style controls its rendering.
% NOTE(review): the url host below (approjects.co.za/?big=...) does not look
% like a Microsoft Research domain; it may be a mirror/proxy rewrite of the
% original publication link. Kept unchanged here -- verify before relying on it.
@techreport{he2009comet,
  author      = {He, Bingsheng and Yang, Mao and Guo, Zhenyu and Chen, Rishan and Su, Bing and Lin, Wei and Zhou, Lidong},
  title       = {{Comet}: Batched Stream Processing in Data Intensive Distributed Computing},
  institution = {Microsoft Research},
  number      = {MSR-TR-2009-180},
  year        = {2009},
  month       = dec,
  url         = {http://approjects.co.za/?big=en-us/research/publication/comet-batched-stream-processing-in-data-intensive-distributed-computing/},
  abstract    = {Performance and resource optimization is an important research problem in data intensive distributed computing. We present a new batched stream processing model that captures query correlations to expose I/O and computation redundancies for optimizations. The model is inspired by our empirical study on a trace from a production large-scale data processing cluster, which reveals significant redundancies caused by strong temporal and spatial correlations among queries. We have developed Comet, a query processing system that embraces the batched stream processing model for optimizations. We have integrated Comet with DryadLINQ. With its roots in query optimizations for database systems, Comet enables a set of new heuristics and opportunities tailored for distributed computing in DryadLINQ. Optimizations in Comet are effective. The evaluation of a micro-benchmark on a 40-machine cluster shows a 42\% reduction in total machine time and over 40\% reduction in total I/O. Our simulation on a real trace covering over 19 million machine hours shows an estimated I/O saving of over 50\%.},
}