@book{li2023multimodal,
  author   = {Li, Chunyuan and Gan, Zhe and Yang, Zhengyuan and Yang, Jianwei and Li, Linjie and Wang, Lijuan and Gao, Jianfeng},
  title    = {Multimodal Foundation Models: From Specialists to General-Purpose Assistants},
  year     = {2023},
  month    = {September},
  abstract = {This paper presents a comprehensive survey of the taxonomy and evolution of multimodal foundation models that demonstrate vision and vision-language capabilities, focusing on the transition from specialist models to general-purpose assistants. The research landscape encompasses five core topics, categorized into two classes. (i) We start with a survey of well-established research areas: multimodal foundation models pre-trained for specific purposes, including two topics -- methods of learning vision backbones for visual understanding and text-to-image generation. (ii) Then, we present recent advances in exploratory, open research areas: multimodal foundation models that aim to play the role of general-purpose assistants, including three topics -- unified vision models inspired by large language models (LLMs), end-to-end training of multimodal LLMs, and chaining multimodal tools with LLMs. The target audiences of the paper are researchers, graduate students, and professionals in computer vision and vision-language multimodal communities who are eager to learn the basics and recent advances in multimodal foundation models.},
  url      = {https://www.microsoft.com/en-us/research/publication/multimodal-foundation-models-from-specialists-to-general-purpose-assistants/},
  note     = {A 118-page book/survey covering the literature review, evolution, trends, and our position on multimodal foundation models. CVPR 2023 Tutorial: https://vlp-tutorial.github.io/2023/},
}