@techreport{lipton2016efficient,
  author      = {Lipton, Zachary C. and Gao, Jianfeng and Li, Lihong and Li, Xiujun and Ahmed, Faisal and Deng, Li},
  title       = {Efficient Exploration for Dialogue Policy Learning with {BBQ} Networks \& Replay Buffer Spiking},
  institution = {Microsoft Research},
  year        = {2016},
  month       = {August},
  number      = {MSR-TR-2016-62},
  abstract    = {When rewards are sparse and action spaces large, Q-learning with $\epsilon$-greedy exploration can be inefficient. This poses problems for otherwise promising applications such as task-oriented dialogue systems, where the primary reward signal, indicating successful completion of a task, requires a complex sequence of appropriate actions. Under these circumstances, a randomly exploring agent might never stumble upon a successful outcome in reasonable time. We present two techniques that significantly improve the efficiency of exploration for deep Q-learning agents in dialogue systems. First, we introduce an exploration technique based on Thompson sampling, drawing Monte Carlo samples from a Bayes-by-backprop neural network, demonstrating marked improvement over common approaches such as $\epsilon$-greedy and Boltzmann exploration. Second, we show that spiking the replay buffer with experiences from a small number of successful episodes, as are easy to harvest for dialogue tasks, can make Q-learning feasible when it might otherwise fail.},
  url         = {http://approjects.co.za/?big=en-us/research/publication/efficient-exploration-dialogue-policy-learning-bbq-networks-replay-buffer-spiking/},
  note        = {arXiv:1608.05081},
}