@inproceedings{jiang2016doubly,
  author    = {Jiang, Nan and Li, Lihong},
  title     = {Doubly Robust Off-policy Evaluation for Reinforcement Learning},
  booktitle = {Proceedings of the 33rd International Conference on Machine Learning (ICML)},
  series    = {JMLR: Workshop and Conference Proceedings},
  year      = {2016},
  month     = {June},
  abstract  = {We study the problem of off-policy value evaluation in reinforcement learning (RL), where one aims to estimate the value of a new policy based on data collected by a different policy. This problem is often a critical step when applying RL to real-world problems. Despite its importance, existing general methods either have uncontrolled bias or suffer high variance. In this work, we extend the doubly robust estimator for bandits to sequential decision-making problems, which gets the best of both worlds: it is guaranteed to be unbiased and can have a much lower variance than the popular importance sampling estimators. We demonstrate the estimator's accuracy in several benchmark problems, and illustrate its use as a subroutine in safe policy improvement. We also provide theoretical results on the inherent hardness of the problem, and show that our estimator can match the lower bound in certain scenarios.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/doubly-robust-off-policy-evaluation-for-reinforcement-learning-2/},
}
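
A minimal sketch of the step-wise doubly robust recursion that the abstract refers to, in assumed notation (not quoted from the entry): $\hat{Q}$ and $\hat{V}$ are approximate value functions from a learned model, $\rho_t = \pi_e(a_t \mid s_t)/\pi_b(a_t \mid s_t)$ is the per-step importance ratio between the evaluation and behavior policies, and $H$ is the horizon; see the cited paper for the authoritative definition.

\[
  V_{\mathrm{DR}}^{0} := 0, \qquad
  V_{\mathrm{DR}}^{H+1-t} := \hat{V}(s_t) + \rho_t \left( r_t + \gamma \, V_{\mathrm{DR}}^{H-t} - \hat{Q}(s_t, a_t) \right),
  \quad t = H, \ldots, 1.
\]

Under this sketch, $V_{\mathrm{DR}}^{H}$ computed on one trajectory, averaged over trajectories collected by $\pi_b$, serves as the off-policy estimate of the value of $\pi_e$: the model terms $\hat{V}$ and $\hat{Q}$ act as a control variate that reduces the variance of plain importance sampling without biasing the estimate.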