@inproceedings{wang2024afrimte,
    author    = {Wang, Jiayi and Adelani, David Ifeoluwa and Agrawal, Sweta and Masiak, Marek and Rei, Ricardo and Briakou, Eleftheria and Carpuat, Marine and He, Xuanli and Bourhim, Sofia and Bukula, Andiswa and Mohamed, Muhidin A. and Olatoye, Temitayo and Mokayed, Hamam and Mwase, Christine and Kimotho, Wangui and Yuehgoh, Foutse and Aremu, Anuoluwapo and Ojo, Jessica and Muhammad, Shamsuddeen Hassan and Osei, Salomey and Omotayo, Abdul-Hakeem and Chukwuneke, Chiamaka Ijeoma and Ogayo, Perez and Hourrane, Oumaima and El Anigri, Salma and Ndolela, Lolwethu and Mangwana, Thabiso and Mohamed, Shafie Abdi and Ayinde, Hassan and Awoyomi, Oluwabusayo Olufunke and Alkhaled, Lama and al-Azzawi, Sana Sabah and Etori, Naome A. and Ochieng, Millicent and Siro, Clemencia and Kiragu, Njoroge and Muchiri, Eric and Kimotho, Wangari and Sakayo, Toadoum Sari and Wamba, Lyse Naomi Momo and Abolade, Daud and Ajao, Simbiat and Adewumi, Tosin and Shode, Iyanuoluwa and Macharm, Ricky Sambo and Iro, Ruqayya Nasir and Abdullahi, Saheed Salahudeen and Moore, Stephen Edward and Opoku, Bernard and Akinjobi, Zainab and Afolabi, Abeeb and Obiefuna, Nnaemeka Casmir and Ogbu, Onyekachi and Ochieng', Sam Brian and Otiende, Verrah Akinyi and Mbonu, Chinedu Emmanuel and Lu, Yao and Stenetorp, Pontus},
    title     = {AfriMTE and AfriCOMET: Enhancing COMET to Embrace Under-resourced African Languages},
    booktitle = {Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
    year      = {2024},
    month     = {June},
    abstract  = {Despite the recent progress in scaling multilingual machine translation (MT) to several under-resourced African languages, accurately measuring this progress remains challenging. Evaluation is often performed using n-gram matching metrics such as BLEU, which typically show a weaker correlation with human judgments. Learned metrics like COMET have a higher correlation; however, challenges such as the lack of evaluation data with human ratings for under-resourced languages, the complexity of annotation guidelines like Multidimensional Quality Metrics (MQM), and the limited language coverage of multilingual encoders have hampered their applicability to African languages. In this paper, we address these challenges by creating high-quality human evaluation data with simplified MQM guidelines for error detection and direct assessment (DA) scoring for 13 typologically diverse African languages. Furthermore, we develop AfriCOMET: COMET evaluation metrics for African languages by leveraging DA data from well-resourced languages and an African-centric multilingual encoder (AfroXLM-R) to create the state-of-the-art MT evaluation metrics for African languages with respect to Spearman-rank correlation with human judgments (+0.441).},
    publisher = {Association for Computational Linguistics},
    url       = {https://www.microsoft.com/en-us/research/publication/afrimte-and-africomet-enhancing-comet-to-embrace-under-resourced-african-languages/},
}