@inproceedings{zhong2024dpo,
  author    = {Zhong, Han and Feng, Guhao and Xiong, Wei and Zhao, Li and He, Di and Bian, Jiang and Wang, Liwei},
  title     = {DPO Meets PPO: Reinforced Token Optimization for RLHF},
  booktitle = {ICML 2025},
  year      = {2024},
  month     = {April},
  abstract  = {In the classical Reinforcement Learning from Human Feedback (RLHF) framework, Proximal Policy Optimization (PPO) is employed to learn from sparse, sentence-level rewards -- a challenging scenario in traditional deep reinforcement learning. Despite the great successes of PPO in the alignment of large language models, its open-source implementation remains largely sub-optimal. To address these issues, we introduce a framework that models RLHF problems as a Markov decision process (MDP), enabling the capture of fine-grained token-wise information. Under this framework, we introduce an algorithm, Reinforced Token Optimization (\texttt{RTO}), which learns a token-wise reward function from preference data and performs policy optimization based on this learned token-wise reward signal. Theoretically, \texttt{RTO} is proven to find a near-optimal policy in a sample-efficient manner. For its practical implementation, \texttt{RTO} innovatively integrates Direct Preference Optimization (DPO) and PPO. DPO, originally derived from sparse sentence rewards, surprisingly provides a token-wise characterization of response quality, which is seamlessly incorporated into our subsequent PPO training stage. Extensive experiments demonstrate that \texttt{RTO} performs better than PPO and other direct preference learning algorithms. In particular, \texttt{RTO} outperforms PPO by 7.5 points on the AlpacaEval 2 benchmark and by 4.1 points on Arena-Hard. Our code and models are available at \href{https://github.com/zkshan2002/RTO}{https://github.com/zkshan2002/RTO}.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/dpo-meets-ppo-reinforced-token-optimization-for-rlhf/},
}