@inproceedings{wu2024privately,
  author    = {Wu, Fan and Inan, Huseyin and Backurs, Arturs and Chandrasekaran, Varun and Kulkarni, Janardhan (Jana) and Sim, Robert},
  title     = {Privately Aligning Language Models with Reinforcement Learning},
  booktitle = {ICLR 2024},
  year      = {2024},
  month     = {May},
  abstract  = {Positioned between pre-training and user deployment, aligning large language models (LLMs) through reinforcement learning (RL) has emerged as a prevailing strategy for training instruction-following models such as ChatGPT. In this work, we initiate the study of privacy-preserving alignment of LLMs through Differential Privacy (DP) in conjunction with RL. Following the influential work of Ziegler et al. (2020), we study two dominant paradigms: (i) alignment via RL without a human in the loop (e.g., positive review generation) and (ii) alignment via RL from human feedback (RLHF) (e.g., summarization in a human-preferred way). We give a new DP framework to achieve alignment via RL, and prove its correctness. Our experimental results validate the effectiveness of our approach, offering competitive utility while ensuring strong privacy protections.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/privately-aligning-language-models-with-reinforcement-learning/},
}