@inproceedings{laroche2018safe,
  author    = {Laroche, Romain and Trichelair, Paul},
  title     = {Safe Policy Improvement with Baseline Bootstrapping},
  booktitle = {European Workshop on Reinforcement Learning (EWRL)},
  year      = {2018},
  month     = {October},
  abstract  = {In this paper, we consider the Batch Reinforcement Learning task and adopt the safe policy improvement (SPI) approach: we compute a target policy that is guaranteed to perform at least as well as a given baseline policy, approximately and with high probability. Our SPI strategy, inspired by the knows-what-it-knows paradigm, consists in bootstrapping the target policy with the baseline policy when the target does not know. We develop a policy-based, computationally efficient bootstrapping algorithm, accompanied by theoretical SPI bounds for the tabular case. We empirically show the limits of existing algorithms on a small stochastic gridworld problem, and then demonstrate that our algorithm improves not only the worst-case performance but also the mean performance.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/safe-policy-improvement-with-baseline-bootstrapping/},
}