% Conference paper (USENIX ATC'21) introducing Warden, an automated incident detection system.
@inproceedings{li2021fighting,
  author    = {Li, Liqun and Zhang, Xu and Zhao, Xin and Zhang, Hongyu and Kang, Yu and Zhao, Pu and Qiao, Bo and He, Shilin and Lee, Pochian and Sun, Jeffrey and Gao, Feng and Yang, Li and Lin, Qingwei and Rajmohan, Saravanakumar and Xu, Zhangwei and Zhang, Dongmei},
  title     = {Fighting the Fog of War: Automated Incident Detection for Cloud Systems},
  booktitle = {2021 {USENIX} Annual Technical Conference ({USENIX} {ATC}'21)},
  year      = {2021},
  month     = aug,
  abstract  = {Incidents and outages dramatically degrade the availability of large-scale cloud computing systems such as AWS, Azure, and GCP. In current incident response practice, each team has only a partial view of the entire system, which makes the detection of incidents like fighting in the ``fog of war''. As a result, prolonged mitigation time and more finance loss are incurred. In this work, we propose an automatic incident detection system, namely Warden, as a part of the Incident Management (IcM) platform. Warden collects alerts from different services and detects the occurrence of incidents from a global perspective. For each detected potential incident, Warden notifies relevant on-call engineers so that they could properly prioritize their tasks and initiate cross-team collaboration. We implemented and deployed Warden in the IcM platform of Azure. Our evaluation results based on data collected in an 18-month period from 26 major services show that Warden is effective and outperforms the baseline methods. For the majority of successfully detected incidents (68\%), Warden is faster than human, and this is particularly the case for the incidents that take long time to detect manually.},
  url       = {https://www.microsoft.com/en-us/research/publication/fighting-the-fog-of-war-automated-incident-detection-for-cloud-systems/},
}