@inproceedings{li2015toward,
  author    = {Li, Lihong and Munos, R{\'e}mi and Szepesv{\'a}ri, Csaba},
  title     = {Toward Minimax Off-policy Value Estimation},
  booktitle = {Proceedings of the 18th International Conference on Artificial Intelligence and Statistics (AISTATS)},
  year      = {2015},
  month     = {May},
  abstract  = {This paper studies the off-policy evaluation problem, where one aims to estimate the value of a target policy based on a sample of observations collected by another policy. We first consider the single-state, or multi-armed bandit, case, establish a finite-time minimax risk lower bound, and analyze the risk of three standard estimators. For the so-called regression estimator, we show that while it is asymptotically optimal, for small sample sizes it may perform suboptimally compared to an ideal oracle, up to a multiplicative factor that depends on the number of actions. We also show that the other two popular estimators can be arbitrarily worse than optimal, even in the limit of infinitely many data points. The performance of the estimators is studied in synthetic and real problems, illustrating the methods' strengths and weaknesses. We also discuss the implications of these results for off-policy evaluation problems in contextual bandits and fixed-horizon Markov decision processes.},
  publisher = {JMLR: Workshop and Conference Proceedings},
  url       = {https://www.microsoft.com/en-us/research/publication/toward-minimax-off-policy-value-estimation/},
}