@inproceedings{yu2024wavecoder,
  author    = {Yu, Zhaojian and Zhang, Xin and Shang, Ning and Huang, Yangyu and Xu, Can and Zhao, Yishujie and Hu, Wenxiang and Yin, Qiufeng},
  title     = {WaveCoder: Widespread And Versatile Enhanced Instruction Tuning with Refined Data Generation},
  booktitle = {2024 Meeting of the Association for Computational Linguistics},
  year      = {2024},
  month     = {October},
  abstract  = {Recent work demonstrates that, after instruction tuning, Code Large Language Models (Code LLMs) can obtain impressive capabilities to address a wide range of code-related tasks. However, current instruction tuning methods for Code LLMs mainly focus on the traditional code generation task, resulting in poor performance in complex multi-task scenarios. In this paper, we concentrate on multiple code-related tasks and present WaveCoder, a series of Code LLMs trained with Widespread And Versatile Enhanced instruction data. To enable the models to tackle complex code-related tasks, we propose a method to stably generate diverse, high-quality instruction data from an open-source code dataset in multi-task scenarios and obtain CodeSeaXDataset, a dataset comprising 19,915 instruction instances across 4 code-related tasks, aimed at improving the generalization ability of Code LLMs. Our experiments demonstrate that WaveCoder models significantly outperform other open-source models in terms of generalization ability across different code-related tasks.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/wavecoder-widespread-and-versatile-enhanced-instruction-tuning-with-refined-data-generation/},
}