@inproceedings{li2023dual-alignment,
  author    = {Li, Ziheng and Huang, Shaohan and Zhang, Zihan and Deng, Zhi-Hong and Lou, Qiang and Huang, Haizhen and Jiao, Jian and Wei, Furu and Deng, Weiwei and Zhang, Qi},
  title     = {Dual-Alignment Pre-training for Cross-lingual Sentence Embedding},
  booktitle = {ACL 2023},
  year      = {2023},
  month     = {May},
  abstract  = {Recent studies have shown that dual encoder models trained with the sentence-level translation ranking task are effective methods for cross-lingual sentence embedding. However, our research indicates that token-level alignment is also crucial in multilingual scenarios, which has not been fully explored previously. Based on our findings, we propose a dual-alignment pre-training (DAP) framework for cross-lingual sentence embedding that incorporates both sentence-level and token-level alignment. To achieve this, we introduce a novel representation translation learning (RTL) task, where the model learns to use one-side contextualized token representation to reconstruct its translation counterpart. This reconstruction objective encourages the model to embed translation information into the token representation. Compared to other token-level alignment methods such as translation language modeling, RTL is more suitable for dual encoder architectures and is computationally efficient. Extensive experiments on three sentence-level cross-lingual benchmarks demonstrate that our approach can significantly improve sentence embedding. Our code is available at this https URL.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/dual-alignment-pre-training-for-cross-lingual-sentence-embedding/},
}
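
The abstract describes the RTL objective only at a high level: one side's contextualized token representations are used to reconstruct those of its translation. Below is a minimal, hypothetical sketch of what such a representation-reconstruction loss might look like with a dual encoder; the head architecture, loss function, and dimensions here are illustrative assumptions, not the configuration from the paper.

```python
# Hypothetical sketch of an RTL-style objective: a small reconstruction
# head maps source-side contextualized token embeddings toward the
# target-side token embeddings of the parallel sentence. All choices
# (transformer head, MSE loss, equal sequence lengths) are assumptions
# for illustration, not the paper's actual setup.
import torch
import torch.nn as nn


class RTLHead(nn.Module):
    def __init__(self, hidden_dim: int = 768, num_layers: int = 2, num_heads: int = 8):
        super().__init__()
        layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim, nhead=num_heads, batch_first=True
        )
        self.reconstructor = nn.TransformerEncoder(layer, num_layers=num_layers)

    def forward(self, src_repr: torch.Tensor) -> torch.Tensor:
        # src_repr: (batch, src_len, hidden) contextualized token
        # representations from one side of the dual encoder.
        return self.reconstructor(src_repr)


def rtl_loss(pred: torch.Tensor, tgt_repr: torch.Tensor) -> torch.Tensor:
    # Mean-squared reconstruction error between predicted and actual
    # target-side token representations (assumes equal sequence lengths
    # for simplicity; real parallel data would need length handling).
    return nn.functional.mse_loss(pred, tgt_repr)


if __name__ == "__main__":
    batch, seq_len, hidden = 4, 16, 768
    # Stand-ins for the dual encoder's source- and target-side outputs.
    src = torch.randn(batch, seq_len, hidden)
    tgt = torch.randn(batch, seq_len, hidden)
    head = RTLHead(hidden)
    # Detach the target so gradients flow only through the reconstruction path.
    loss = rtl_loss(head(src), tgt.detach())
    loss.backward()
    print(f"RTL reconstruction loss: {loss.item():.4f}")
```

In practice this objective would be trained jointly with the sentence-level translation ranking loss the abstract mentions; this sketch shows only the token-level reconstruction component.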