@inproceedings{saha2021adversarial,
  author    = {Saha, Aadirupa and Koren, Tomer and Mansour, Yishay},
  title     = {Adversarial Dueling Bandits},
  booktitle = {International Conference on Machine Learning},
  year      = {2021},
  month     = {July},
  abstract  = {We introduce the problem of regret minimization in Adversarial Dueling Bandits. As in classic Dueling Bandits, the learner has to repeatedly choose a pair of items and observe only a relative binary `win-loss' feedback for this pair, but here this feedback is generated from an arbitrary preference matrix, possibly chosen adversarially. Our main result is an algorithm whose $T$-round regret compared to the \emph{Borda-winner} from a set of $K$ items is $\tilde{O}(K^{1/3}T^{2/3})$, as well as a matching $\Omega(K^{1/3}T^{2/3})$ lower bound. We also prove a similar high probability regret bound. We further consider a simpler \emph{fixed-gap} adversarial setup, which bridges between two extreme preference feedback models for dueling bandits: stationary preferences and an arbitrary sequence of preferences. For the fixed-gap adversarial setup we give an $\smash{\tilde{O}((K/\Delta^2)\log T)}$ regret algorithm, where $\Delta$ is the gap in Borda scores between the best item and all other items, and show a lower bound of $\Omega(K/\Delta^2)$ indicating that our dependence on the main problem parameters $K$ and $\Delta$ is tight (up to logarithmic factors).},
  url       = {http://approjects.co.za/?big=en-us/research/publication/optimal-regret-algorithm-for-pseudo-1d-bandit-convex-optimization/},
}