% Conference paper (ADCS 2018); key scheme: lastnameYYYYkeyword.
% month uses the predefined unquoted macro `dec` so styles/languages expand it;
% {SERPs} is braced to survive sentence-casing styles; title stored in Title Case.
@inproceedings{thomas2018better,
  author    = {Thomas, Paul and Moffat, Alistair and Bailey, Peter and Scholer, Falk and Craswell, Nick},
  title     = {Better Effectiveness Metrics for {SERPs}, Cards, and Rankings},
  booktitle = {Australasian Document Computing Symposium},
  year      = {2018},
  month     = dec,
  publisher = {ACM},
  url       = {http://approjects.co.za/?big=en-us/research/publication/better-effectiveness-metrics-for-serps-cards-and-rankings/},
  note      = {Best paper award},
  abstract  = {Offline metrics for IR evaluation are often derived from a user model that seeks to capture the interaction between the user and the ranking, conflating the interaction with a ranking of documents with the user's interaction with the search results page. A desirable property of any effectiveness metric is if the scores it generates over a set of rankings correlate well with the ``satisfaction'' or ``goodness'' scores attributed to those same rankings by a population of searchers. Using data from a large-scale web search engine, we find that offline effectiveness metrics do not correlate well with a behavioural measure of satisfaction that can be inferred from user activity logs. We then examine three mechanisms to improve the correlation: tuning the model parameters; improving the label coverage, so that more kinds of item are labelled and hence included in the evaluation; and modifying the underlying user models that describe the metrics. In combination, these three mechanisms transform a wide range of common metrics into ``card-aware'' variants which allow for the gain from cards (or snippets), varying probabilities of clickthrough, and good abandonment.},
}