@inproceedings{foster2019model, author = {Foster, Dylan and Krishnamurthy, Akshay and Luo, Haipeng}, title = {Model selection for contextual bandits}, booktitle = {Advances in Neural Information Processing Systems}, year = {2019}, month = {December}, abstract = {We introduce the problem of model selection for contextual bandits, wherein a learner must adapt to the complexity of the optimal policy while balancing exploration and exploitation. Our main result is a new model selection guarantee for linear contextual bandits. We work in the stochastic realizable setting with a sequence of nested linear policy classes of dimension $d_1 < d_2 < \ldots$ where the $m^\star$-th class contains the optimal policy, and we design an algorithm that achieves $\tilde[O](T^[2/3]d_[m^\star]^[1/3])$ regret with no prior knowledge of the optimal dimension $d_[m^\star]$. The algorithm also achieves regret $\tilde[O](T^[3/4] + \sqrt[Td_[m^\star]])$, which is optimal for $d_[m^\star] \geq \sqrt[T]$. This is the first contextual bandit model selection result with non-vacuous regret for all values of $d_[m^\star]$ and, to the best of our knowledge, is the first guarantee of its type in any contextual bandit setting. The core of the algorithm is a new estimator for the gap in best loss achievable by two linear policy classes, which we show admits a convergence rate faster than what is required to learn either class.}, url = {http://approjects.co.za/?big=en-us/research/publication/model-selection-for-contextual-bandits/}, }