@techreport{asadi2016sample-efficient,
  author    = {Asadi, Kavosh and Williams, Jason},
  title     = {Sample-efficient Deep Reinforcement Learning for Dialog Control},
  year      = {2016},
  month     = {December},
  abstract  = {Representing a dialog policy as a recurrent neural network (RNN) is attractive because it handles partial observability, infers a latent representation of state, and can be optimized with supervised learning (SL) or reinforcement learning (RL). For RL, a policy gradient approach is natural, but is sample inefficient. In this paper, we present 3 methods for reducing the number of dialogs required to optimize an RNN-based dialog policy with RL. The key idea is to maintain a second RNN which predicts the value of the current policy, and to apply experience replay to both networks. On two tasks, these methods reduce the number of dialogs/episodes required by about a third, vs. standard policy gradient methods.},
  publisher = {arxiv},
  url       = {http://approjects.co.za/?big=en-us/research/publication/sample-efficient-deep-reinforcement-learning-dialog-control/},
  number    = {MSR-TR-2016-1134},
}
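
The abstract above describes an actor-critic-style setup: a recurrent policy network plus a second recurrent network that estimates the value of the current policy, with experience replay applied to both. The following is a minimal illustrative sketch of that idea, not the paper's implementation; the network shapes, hyperparameters, replay-buffer layout, and the simple replayed update shown here (which omits any off-policy correction) are all assumptions.

# Illustrative sketch only: a recurrent policy net and a second recurrent
# value net, both updated from an experience-replay buffer of whole dialogs.
# All names and sizes are hypothetical; this is not the authors' code.
import random
from collections import deque

import torch
import torch.nn as nn
from torch.distributions import Categorical


class RecurrentNet(nn.Module):
    """RNN body with a linear head (policy logits or a scalar value)."""
    def __init__(self, obs_dim, hidden_dim, out_dim):
        super().__init__()
        self.rnn = nn.GRU(obs_dim, hidden_dim, batch_first=True)
        self.head = nn.Linear(hidden_dim, out_dim)

    def forward(self, obs_seq):
        # obs_seq: (batch, time, obs_dim) -> per-step outputs (batch, time, out_dim)
        h, _ = self.rnn(obs_seq)
        return self.head(h)


obs_dim, hidden_dim, n_actions = 20, 32, 5        # hypothetical sizes
policy_net = RecurrentNet(obs_dim, hidden_dim, n_actions)   # policy RNN
value_net = RecurrentNet(obs_dim, hidden_dim, 1)             # value RNN
policy_opt = torch.optim.Adam(policy_net.parameters(), lr=1e-3)
value_opt = torch.optim.Adam(value_net.parameters(), lr=1e-3)

# Replay buffer of whole dialogs: (obs_seq (1,T,obs_dim), actions (1,T), returns (1,T))
replay = deque(maxlen=1000)


def update_from_replay(batch_size=8):
    """One replayed policy-gradient / value-regression pass over stored dialogs."""
    if len(replay) < batch_size:
        return
    for obs_seq, actions, returns in random.sample(list(replay), batch_size):
        logits = policy_net(obs_seq)                 # (1, T, n_actions)
        values = value_net(obs_seq).squeeze(-1)      # (1, T)
        dist = Categorical(logits=logits)
        advantage = (returns - values).detach()      # value net acts as a baseline
        policy_loss = -(dist.log_prob(actions) * advantage).mean()
        value_loss = ((returns - values) ** 2).mean()
        policy_opt.zero_grad(); policy_loss.backward(); policy_opt.step()
        value_opt.zero_grad(); value_loss.backward(); value_opt.step()

In this sketch the value RNN serves as a learned baseline that reduces the variance of the policy-gradient estimate, which is one plausible reading of how a value-predicting second network plus replay could cut the number of dialogs needed; the paper itself should be consulted for the three specific methods it evaluates.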