@misc{barke2026agentrx,
  author   = {Barke, Shraddha and Goyal, Arnav and Khare, Alind and Singh, Avaljot and Nath, Suman and Bansal, Chetan},
  title    = {{AgentRx}: Diagnosing {AI} Agent Failures from Execution Trajectories},
  year     = {2026},
  month    = feb,
  abstract = {AI agents often fail in ways that are difficult to localize because executions are probabilistic, long-horizon, multi-agent, and mediated by noisy tool outputs. We address this gap by manually annotating failed agent runs and release a novel benchmark of 115 failed trajectories spanning structured API workflows, incident management, and open-ended web/file tasks. Each trajectory is annotated with a critical failure step and a category from a grounded-theory derived, cross domain failure taxonomy. To mitigate the human cost of failure attribution, we present AgentRx, an automated domain-agnostic diagnostic framework that pinpoints the critical failure step in a failed agent trajectory. It synthesizes constraints, evaluates them step-by-step, and produces an auditable validation log of constraint violations with associated evidence; an LLM-based judge uses this log to localize the critical step and category. Our framework improves step localization and failure attribution over existing baselines across three domains.},
  url      = {http://approjects.co.za/?big=en-us/research/publication/agentrx-diagnosing-ai-agent-failures-from-execution-trajectories/},
}