% Conference paper (ADCS 2018); key scheme: lastnameYYYYkeyword.
% month uses the predefined unquoted macro `dec` so styles/languages expand it;
% {SERPs} is braced to survive sentence-casing styles; title stored in Title Case.
@inproceedings{thomas2018better,
  author    = {Thomas, Paul and Moffat, Alistair and Bailey, Peter and Scholer, Falk and Craswell, Nick},
  title     = {Better Effectiveness Metrics for {SERPs}, Cards, and Rankings},
  booktitle = {Australasian Document Computing Symposium},
  year      = {2018},
  month     = dec,
  publisher = {ACM},
  url       = {http://approjects.co.za/?big=en-us/research/publication/better-effectiveness-metrics-for-serps-cards-and-rankings/},
  note      = {Best paper award},
  abstract  = {Offline metrics for IR evaluation are often derived from a user model that seeks to capture the interaction between the user and the ranking, conflating the interaction with a ranking of documents with the user's interaction with the search results page. A desirable property of any effectiveness metric is if the scores it generates over a set of rankings correlate well with the ``satisfaction'' or ``goodness'' scores attributed to those same rankings by a population of searchers. Using data from a large-scale web search engine, we find that offline effectiveness metrics do not correlate well with a behavioural measure of satisfaction that can be inferred from user activity logs. We then examine three mechanisms to improve the correlation: tuning the model parameters; improving the label coverage, so that more kinds of item are labelled and hence included in the evaluation; and modifying the underlying user models that describe the metrics. In combination, these three mechanisms transform a wide range of common metrics into ``card-aware'' variants which allow for the gain from cards (or snippets), varying probabilities of clickthrough, and good abandonment.},
}