@techreport{balachandran2024eureka,
author = {Balachandran, Vidhisha and Chen, Jingya and Joshi, Neel and Nushi, Besmira and Palangi, Hamid and Salinas, Eduardo and Vineet, Vibhav and Woffinden-Luey, James and Yousefi, Safoora},
title = {EUREKA: Evaluating and Understanding Large Foundation Models},
institution = {Microsoft},
year = {2024},
month = {September},
abstract = {Rigorous and reproducible evaluation of large foundation models is critical for assessing the state of the art, informing next steps in model improvement, and for guiding scientific advances in Artificial Intelligence (AI). Evaluation is also important for informing the increasing number of application developers that build services on foundation models. The evaluation process has however become challenging in practice due to several reasons that require immediate attention from the community, including benchmark saturation, lack of transparency in the methods being deployed for measurement, development challenges in extracting the right measurements for generative tasks, and, more generally, the extensive number of capabilities that need to be considered for showing a well-rounded comparison across models. In addition, despite the overwhelming numbers of side-by-side capability evaluations available, we still lack a deeper understanding about when and how different models fail for a given capability and whether the nature of failures is similar across different models being released over time.


We make three contributions to alleviate the above challenges. First, we present Eureka, a reusable and open evaluation framework for standardizing evaluations of large foundation models beyond single-score reporting and rankings. Second, we introduce Eureka-Bench as an extensible collection of benchmarks testing capabilities that (i) are still challenging for state-of-the-art foundation models and (ii) represent fundamental but overlooked capabilities for completing tasks in both language and vision modalities. The available space for improvement that comes inherently from non-saturated benchmarks, enables us to discover meaningful differences between models at a capability level. Third, using the framework and Eureka-Bench, we conduct an analysis of 12 state-of-the-art models, providing in-depth insights for failure understanding and model comparison by disaggregating the measurements across important subcategories of data. Such insights uncover granular weaknesses of models for a given capability and can then be further leveraged to plan more precisely on what areas are most promising for improvement. Eureka is available as open-source to foster transparent and reproducible evaluation practices.


In contrast to recent trends in evaluation reports and leaderboards showing absolute rankings and claims for one model or another to be the best, our analysis shows that there is no such best model. Different models have different strengths, but there are models that appear more often than others as best performers for several capabilities. Despite the many observed improvements, it also becomes obvious that current models still struggle with a number of fundamental capabilities including detailed image understanding, benefiting from multimodal input when available rather than fully relying on language, factuality and grounding for information retrieval, and over refusals.},
url = {http://approjects.co.za/?big=en-us/research/publication/eureka-evaluating-and-understanding-large-foundation-models/},
number = {MSR-TR-2024-33},
}