@inproceedings{beygelzimer2011contextual,
author = {Beygelzimer, Alina and Langford, John and Li, Lihong and Reyzin, Lev and Schapire, Robert E.},
title = {Contextual bandit algorithms with supervised learning guarantees},
organization = {Artiﬁcial Intelligence and Statistics},
booktitle = {AISTATS 2011},
year = {2011},
month = {October},
abstract = {We address the problem of competing with any large set of N policies in the nonstochastic bandit setting, where the learner must repeatedly select among K actions but observes only the reward of the chosen action.

We present a modiﬁcation of the Exp4 algorithm of Auer et al. [2], called Exp4.P, which with high probability incurs regret at most O(√KT lnN). Such a bound does not hold for Exp4 due to the large variance of the importance-weighted estimates used in the algorithm. The new algorithm is tested empirically in a large-scale, real-world dataset. For the stochastic version of the problem, we can use Exp4.P as a subroutine to compete with a possibly inﬁnite set of policies of VCdimension d while incurring regret at most O(√TdlnT) with high probability.

These guarantees improve on those of all previous algorithms, whether in a stochastic or adversarial environment, and bring us closer to providing guarantees for this setting that are comparable to those in standard supervised learning.},
url = {http://approjects.co.za/?big=en-us/research/publication/contextual-bandit-algorithms-with-supervised-learning-guarantees-2/},
}