@techreport{li2014on,
  author      = {Li, Lihong},
  title       = {On Minimax Optimal Offline Policy Evaluation},
  institution = {Microsoft Research},
  number      = {MSR-TR-2014-124},
  year        = {2014},
  month       = sep,
  url         = {http://approjects.co.za/?big=en-us/research/publication/on-minimax-optimal-offline-policy-evaluation/},
  abstract    = {This paper studies the off-policy evaluation problem, where one aims to estimate the value of a target policy based on a sample of observations collected by another policy. We first consider the multi-armed bandit case, establish a minimax risk lower bound, and analyze the risk of two standard estimators. It is shown, and verified in simulation, that one is minimax optimal up to a constant, while another can be arbitrarily worse, despite its empirical success and popularity. The results are applied to related problems in contextual bandits and fixed-horizon Markov decision processes, and are also related to semi-supervised learning.},
}