% Conference paper introducing Hindsight, a retroactive-sampling distributed
% tracing system (Symposium on Networked Systems Design and Implementation, 2023).
%
% Maintenance notes:
%   - month uses the predefined three-letter macro (apr), unquoted, so that
%     bibliography styles can abbreviate or localise it.
%   - NOTE(review): publisher is recorded as ACM, but NSDI proceedings are
%     normally published by the USENIX Association -- confirm and correct.
%   - NOTE(review): the url host does not look like the publisher's own site
%     (possibly a third-party mirror) -- confirm the canonical URL, and add a
%     doi or urldate if one is available.
%   - NOTE(review): booktitle lacks the usual ordinal/organisation naming of
%     the proceedings -- confirm against the official proceedings title.
@inproceedings{zhang2023the,
  author    = {Zhang, Lei and Xie, Zhiqiang and Anand, Vaastav and Vigfusson, Ymir and Mace, Jonathan},
  title     = {The Benefit of Hindsight: Tracing Edge-Cases in Distributed Systems},
  booktitle = {2023 Symposium on Networked Systems Design and Implementation},
  year      = {2023},
  month     = apr,
  publisher = {ACM},
  url       = {http://approjects.co.za/?big=en-us/research/publication/the-benefit-of-hindsight-tracing-edge-cases-in-distributed-systems/},
  abstract  = {Today's distributed tracing frameworks are ill-equipped to troubleshoot rare edge-case requests.
               The crux of the problem is a trade-off between specificity and overhead.
               On the one hand, frameworks can indiscriminately select requests to trace when they enter the system (head sampling), but this is unlikely to capture a relevant edge-case trace because the framework cannot know which requests will be problematic until after-the-fact.
               On the other hand, frameworks can trace everything and later keep only the interesting edge-case traces (tail sampling), but this has high overheads on the traced application and enormous data ingestion costs.
               In this paper we circumvent this trade-off for any edge-case with symptoms that can be programmatically detected, such as high tail latency, errors, and bottlenecked queues.
               We propose a lightweight and always-on distributed tracing system, Hindsight, which implements a retroactive sampling abstraction: instead of eagerly ingesting and processing traces, Hindsight lazily retrieves trace data only after symptoms of a problem are detected.
               Hindsight is analogous to a car dash-cam that, upon detecting a sudden jolt in momentum, persists the last hour of footage.
               Developers using Hindsight receive the exact edge-case traces they desire without undue overhead or dependence on luck.
               Our evaluation shows that Hindsight scales to millions of requests per second, adds nanosecond-level overhead to generate trace data, handles GB/s of data per node, transparently integrates with existing distributed tracing systems, and successfully persists full, detailed traces in real-world use cases when edge-case problems are detected.},
}