@techreport{chandramouli2010accurate,
author = {Chandramouli, Badrish and Goldstein, Jonathan and Barga, Roger and Riedewald, Mirek and Santos, Ivo and },
title = {Accurate Latency Estimation in a Distributed Event Processing System},
year = {2010},
month = {July},
abstract = {A distributed event processing system consists of one or more nodes (machines), and can execute a directed acyclic graph (DAG) of operators called a dataflow (or query), over long-running high-event-rate data sources. An important component of such a system is cost estimation, which predicts or estimates the "goodness" of a given input, i.e., operator graph and/or assignment of individual operators to nodes. Cost estimation is the foundation for solving many problems: optimization (plan selection and distributed operator placement), provisioning, admission control, and user reporting of system misbehavior.

Latency is a significant user metric in many commercial real-time applications. Users are usually interested in quantiles of latency, such as worst-case or 99th percentile. However, existing cost estimation techniques for event-based dataflows use metrics that, while they may have the side-effect of being correlated with latency, do not directly or provably estimate latency. In this paper, we propose a new cost estimation technique using a metric called Mace (Maximum cumulative excess). Mace is provably equivalent to maximum system latency in a (potentially complex, multi-node) distributed event-based system. The close relationship to latency makes Mace ideal for addressing the problems described earlier. Experiments with real-world datasets on Microsoft StreamInsight deployed over 1-13 nodes in a data center validate our ability to closely estimate latency (within 4%), and the use of Mace for plan selection and distributed operator placement.},
url = {http://approjects.co.za/?big=en-us/research/publication/accurate-latency-estimation-in-a-distributed-event-processing-system/},
number = {MSR-TR-2010-146},
}