@inproceedings{laroche2019spibb-dqn,
  author    = {Laroche, Romain and {Tachet des Combes}, R{\'e}mi},
  title     = {{SPIBB-DQN}: Safe Batch Reinforcement Learning with Function Approximation},
  booktitle = {The 4th Multidisciplinary Conference on Reinforcement Learning and Decision Making ({RLDM})},
  year      = {2019},
  month     = jul,
  abstract  = {We consider Safe Policy Improvement (SPI) in Batch Reinforcement Learning (Batch RL): from a fixed dataset and without direct access to the true environment, train a policy that is guaranteed to perform at least as well as the baseline policy used to collect the data. Our contribution is a model-free version of the SPI with Baseline Bootstrapping (SPIBB) algorithm, called SPIBB-DQN, which consists in applying the Bellman update only in state-action pairs that have been sufficiently sampled in the batch. In low-visited parts of the environment, the trained policy reproduces the baseline. We show its benefits on a navigation task and on CartPole. SPIBB-DQN is, to the best of our knowledge, the first RL algorithm relying on a neural network representation able to train efficiently and reliably from batch data, without any interaction with the environment.},
  url       = {https://www.microsoft.com/en-us/research/publication/spibb-dqn-safe-batch-reinforcement-learning-with-function-approximation/},
  internal-note = {reviewed 2024: fixed month macro, braced compound surname and acronyms, repaired mangled export URL (approjects.co.za rewrite of www.microsoft.com) -- verify URL resolves},
}