% Dekel, Ding, Koren, Peres -- conference paper on bandits with switching costs (2014).
% Math in title/abstract is written as LaTeX so it survives any bibliography style.
% NOTE(review): the url host looks like a mirror/proxy rather than the publisher's
% own site -- confirm the address, and prefer adding a doi field once verified.
@inproceedings{dekel2014bandits,
  author    = {Dekel, Ofer and Ding, Jian and Koren, Tomer and Peres, Yuval},
  title     = {Bandits with Switching Costs: {$T^{2/3}$} Regret},
  booktitle = {Proceedings of the 46th Annual Symposium on the Theory of Computing},
  publisher = {ACM},
  year      = {2014},
  month     = may,
  url       = {http://approjects.co.za/?big=en-us/research/publication/bandits-switching-costs-t-23-regret/},
  abstract  = {We study the adversarial multi-armed bandit problem in a setting where the player incurs a unit cost each time he switches actions. We prove that the player's $T$-round minimax regret in this setting is $\widetilde{\Theta}(T^{2/3})$, thereby closing a fundamental gap in our understanding of learning with bandit feedback. In the corresponding full-information version of the problem, the minimax regret is known to grow at a much slower rate of $\Theta(\sqrt{T})$. The difference between these two rates provides the first indication that learning with bandit feedback can be significantly harder than learning with full-information feedback (previous results only showed a different dependence on the number of actions, but not on $T$). In addition to characterizing the inherent difficulty of the multi-armed bandit problem with switching costs, our results also resolve several other open problems in online learning. One direct implication is that learning with bandit feedback against bounded-memory adaptive adversaries has a minimax regret of $\widetilde{\Theta}(T^{2/3})$. Another implication is that the minimax regret of online learning in adversarial Markov decision processes (MDPs) is $\widetilde{\Theta}(T^{2/3})$. The key to all of our results is a new randomized construction of a multi-scale random walk, which is of independent interest and likely to prove useful in additional settings.},
}