@inproceedings{gehrmann2021the,
  author    = {Gehrmann, Sebastian and Adewumi, Tosin and Aggarwal, Karmanya and Sasanka Ammanamanchi, Pawan and Aremu, Anuoluwapo and Bosselut, Antoine and Raghavi Chandu, Khyathi and Clinciu, Miruna-Adriana and Das, Dipanjan and Dhole, Kaustubh and Du, Wanyu and Durmus, Esin and Dušek, Ondřej and Chinenye Emezue, Chris and Gangal, Varun and Garbacea, Cristina and Hashimoto, Tatsunori and Hou, Yufang and Jernite, Yacine and Jhamtani, Harsh and Ji, Yangfeng and Jolly, Shailza and Kale, Mihir and Kumar, Dhruv and Ladhak, Faisal and Madaan, Aman and Maddela, Mounica and Mahajan, Khyati and Mahamood, Saad and Prasad Majumder, Bodhisattwa and Henrique Martins, Pedro and McMillan-Major, Angelina and Mille, Simon and van Miltenburg, Emiel and Nadeem, Moin and Narayan, Shashi and Nikolaev, Vitaly and Niyongabo Rubungo, Andre and Osei, Salomey and Parikh, Ankur and Perez-Beltrachini, Laura and Ramesh Rao, Niranjan and Raunak, Vikas and Diego Rodriguez, Juan and Santhanam, Sashank and Sedoc, João and Sellam, Thibault and Shaikh, Samira and Shimorina, Anastasia and Antonio Sobrevilla Cabezudo, Marco and Strobelt, Hendrik and Subramani, Nishant and Xu, Wei and Yang, Diyi and Yerukola, Akhila and Zhou, Jiawei},
  title     = {The {GEM} Benchmark: Natural Language Generation, its Evaluation and Metrics},
  booktitle = {Proceedings of the 1st Workshop on Natural Language Generation, Evaluation, and Metrics ({GEM} 2021)},
  publisher = {Association for Computational Linguistics},
  year      = {2021},
  month     = aug,
  pages     = {96--120},
  url       = {http://approjects.co.za/?big=en-us/research/publication/the-gem-benchmark-natural-language-generation-its-evaluation-and-metrics/},
  abstract  = {We introduce GEM, a living benchmark for natural language Generation (NLG), its Evaluation, and Metrics. Measuring progress in NLG relies on a constantly evolving ecosystem of automated metrics, datasets, and human evaluation standards. Due to this moving target, new models often still evaluate on divergent anglo-centric corpora with well-established, but flawed, metrics. This disconnect makes it challenging to identify the limitations of current models and opportunities for progress. Addressing this limitation, GEM provides an environment in which models can easily be applied to a wide set of tasks and in which evaluation strategies can be tested. Regular updates to the benchmark will help NLG research become more multilingual and evolve the challenge alongside models. This paper serves as the description of the data for the 2021 shared task at the associated GEM Workshop.},
}