@inproceedings{wang2024combinatorial,
  author    = {Wang, Yiliu and Chen, Wei and Vojnovic, Milan},
  title     = {Combinatorial Bandits for Maximum Value Reward Function under Value-index Feedback},
  booktitle = {Proceedings of the 12th International Conference on Learning Representations ({ICLR})},
  year      = {2024},
  month     = may,
  abstract  = {We investigate the combinatorial multi-armed bandit problem where an action is to select k arms from a set of base arms, and its reward is the maximum of the sample values of these k arms, under a weak feedback structure that only returns the value and index of the arm with the maximum value. This novel feedback structure is much weaker than the semi-bandit feedback previously studied and is only slightly stronger than the full-bandit feedback, and thus it presents a new challenge for the online learning task. We propose an algorithm and derive a regret bound for instances where arm outcomes follow distributions with finite supports. Our algorithm introduces a novel concept of biased arm replacement to address the weak feedback challenge, and it achieves a distribution-dependent regret bound of $O((nk/\Delta)\log(T))$ and a distribution-independent regret bound of $\tilde{O}(\sqrt{nkT})$, where $\Delta$ is the reward gap and T is the time horizon. Notably, our regret bound is comparable to the bounds obtained under the more informative semi-bandit feedback. We demonstrate the effectiveness of our algorithm through experimental results.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/combinatorial-bandits-for-maximum-value-reward-function-under-value-index-feedback/},
}