@misc{yang2025verusage,
  author   = {Yang, Chenyuan and Neamtu, Natalie and Hawblitzel, Chris and Lorch, Jay and Lu, Shan},
  title    = {{VeruSAGE}: A Study of Agent-Based Verification for {Rust} Systems},
  year     = {2025},
  month    = dec,
  abstract = {Large language models (LLMs) have shown impressive capability to understand and develop code. However, their capability to rigorously reason about and prove code correctness remains in question. This paper offers a comprehensive study of LLMs' capability to develop correctness proofs for system software written in Rust. We curate a new system-verification benchmark suite, VeruSAGE-Bench, which consists of 849 proof tasks extracted from eight open-source Verus-verified Rust systems. Furthermore, we design different agent systems to match the strengths and weaknesses of different LLMs (o4-mini, GPT-5, Sonnet 4, and Sonnet 4.5). Our study shows that different tools and agent settings are needed to stimulate the system-verification capability of different types of LLMs. The best LLM-agent combination in our study completes over 80\% of system-verification tasks in VeruSAGE-Bench. It also completes over 90\% of a set of system proof tasks not part of VeruSAGE-Bench because they had not yet been finished by human experts. This result shows the great potential for LLM-assisted development of verified system software. Our VeruSAGE-Bench and all the implementation and results are released at benchmarks/VeruSAGE-Bench in the microsoft/verus-proof-synthesis repository.},
  url      = {https://www.microsoft.com/en-us/research/publication/verusage-a-study-of-agent-based-verification-for-rust-systems/},
}