@inproceedings{chen2021kb-vlp,
  author    = {Chen, Kezhen and Huang, Qiuyuan and Bisk, Yonatan and McDuff, Daniel and Gao, Jianfeng},
  title     = {KB-VLP: Knowledge Based Vision and Language Pretraining},
  booktitle = {Proceedings of the 38th International Conference on Machine Learning (ICML 2021) Workshop, PMLR 139},
  year      = {2021},
  month     = {July},
  abstract  = {Transformer-based pretraining techniques have achieved impressive performance in learning cross-modal representations for various multimodal tasks. However, off-the-shelf models do not take advantage of the commonsense knowledge and logical reasoning that are crucial to many real-world tasks. To this end, we introduce a novel pretraining approach, Knowledge Based Vision and Language Pretraining (KB-VLP), which uses knowledge graph embeddings extracted from text and detected image object tags to enhance the learning of semantically aligned and knowledge-aware representations, and to improve the model's generalization and interpretability. KB-VLP is pretrained on a large image-text corpus together with automatically extracted knowledge embeddings, and then fine-tuned on several downstream vision-language tasks. Experiments show that KB-VLP significantly improves performance on the VQA, GQA, NLVR2, and OKVQA tasks compared with the baselines.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/kb-vlp-knowledge-based-vision-and-language-pretraining-2/},
}