@inproceedings{wang2022text,
  author    = {Wang, Liang and Yang, Nan and Huang, Xiaolong and Jiao, Binxing and Yang, Linjun and Jiang, Daxin and Majumder, Rangan and Wei, Furu},
  title     = {Text Embeddings by Weakly-Supervised Contrastive Pre-training},
  year      = {2022},
  month     = {December},
  abstract  = {This paper presents E5, a family of state-of-the-art text embeddings that transfer well to a wide range of tasks. The model is trained in a contrastive manner with weak supervision signals from our curated large-scale text pair dataset (called CCPairs). E5 can be readily used as a general-purpose embedding model for any tasks requiring a single-vector representation of texts such as retrieval, clustering, and classification, achieving strong performance in both zero-shot and fine-tuned settings. We conduct extensive evaluations on 56 datasets from the BEIR and MTEB benchmarks. For zero-shot settings, E5 is the first model that outperforms the strong BM25 baseline on the BEIR retrieval benchmark without using any labeled data. When fine-tuned, E5 obtains the best results on the MTEB benchmark, beating existing embedding models with 40x more parameters.},
  publisher = {arXiv},
  url       = {https://www.microsoft.com/en-us/research/publication/text-embeddings-by-weakly-supervised-contrastive-pre-training/},
}
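
The abstract describes E5 as a general-purpose single-vector embedding model for retrieval, clustering, and classification. The entry itself contains no code; the following is only a minimal usage sketch, assuming the publicly released intfloat/e5-base checkpoint on Hugging Face and the transformers library, with the "query:"/"passage:" input prefixes and mean pooling commonly used with E5-style encoders.

# Hypothetical sketch (not from the paper): embed a query and a passage with an
# assumed E5 checkpoint and compare them with cosine similarity.
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-base")  # assumed checkpoint name
model = AutoModel.from_pretrained("intfloat/e5-base")

# E5-style models typically expect "query: " / "passage: " prefixes for retrieval.
texts = [
    "query: how do contrastive text embeddings work",
    "passage: E5 is trained contrastively on weakly supervised text pairs (CCPairs).",
]

batch = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**batch)

# Mean-pool token states over non-padding positions, then L2-normalize so that
# the dot product of two embeddings equals their cosine similarity.
mask = batch["attention_mask"].unsqueeze(-1).float()
embeddings = (outputs.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1)
embeddings = F.normalize(embeddings, p=2, dim=1)

similarity = embeddings[0] @ embeddings[1]
print(float(similarity))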