{"id":999750,"date":"2024-01-30T05:22:13","date_gmt":"2024-01-30T13:22:13","guid":{"rendered":"https:\/\/www.microsoft.com\/en-us\/research\/?post_type=msr-blog-post&#038;p=999750"},"modified":"2024-06-10T09:57:52","modified_gmt":"2024-06-10T16:57:52","slug":"kahani-visual-storytelling-through-culturally-nuanced-images","status":"publish","type":"msr-blog-post","link":"https:\/\/www.microsoft.com\/en-us\/research\/articles\/kahani-visual-storytelling-through-culturally-nuanced-images\/","title":{"rendered":"Kahani: Visual Storytelling through Culturally Nuanced Images"},"content":{"rendered":"\n<p class=\"has-purple-color has-text-color has-link-color wp-elements-3328a4575af49165039ce11f64848a49\"><em>Presented by <a href=\"https:\/\/www.microsoft.com\/en-us\/research\/people\/sameersegal\/\" target=\"_blank\" rel=\"noreferrer noopener\">Sameer Segal<\/a> at <strong>Microsoft Research Forum, January 2024<\/strong><\/em><\/p>\n\n\n\n<div class=\"wp-block-media-text has-vertical-margin-none  has-vertical-padding-none  is-stacked-on-mobile has-white-background-color has-background\" style=\"grid-template-columns:25% auto\"><figure class=\"wp-block-media-text__media\"><img loading=\"lazy\" decoding=\"async\" width=\"360\" height=\"360\" src=\"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Sameer-Segal_360x360.jpg\" alt=\"Sameer Segal headshot\" class=\"wp-image-999753 size-full\" srcset=\"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Sameer-Segal_360x360.jpg 360w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Sameer-Segal_360x360-300x300.jpg 300w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Sameer-Segal_360x360-150x150.jpg 150w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Sameer-Segal_360x360-180x180.jpg 180w\" sizes=\"auto, (max-width: 360px) 100vw, 360px\" \/><\/figure><div 
class=\"wp-block-media-text__content\">\n<blockquote class=\"wp-block-quote is-style-spectrum is-layout-flow wp-block-quote-is-layout-flow\">\n<p>\u201c[Project Kahani is] trying to bring not only visually stunning images but also bring in cultural nuances to it. Past work has shown that diffusion models tend to stereotype and fail to understand local words, but they don\u2019t provide ways to overcome these shortcomings without modifying the model or using fine-tuning.\u201d<\/p>\n<cite><em>\u2013<\/em> Sameer Segal, Principal Research Software Development Engineer<\/cite><\/blockquote>\n<\/div><\/div>\n\n\n\n<figure class=\"wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect-16-9 wp-has-aspect-ratio\"><div class=\"wp-block-embed__wrapper\">\n<div class=\"yt-consent-placeholder\" role=\"region\" aria-label=\"Video playback requires cookie consent\" data-video-id=\"uXyt_E2_myA\" data-poster=\"https:\/\/img.youtube.com\/vi\/uXyt_E2_myA\/maxresdefault.jpg\"><iframe aria-hidden=\"true\" tabindex=\"-1\" title=\"Kahani: Visual Storytelling through Culturally Nuanced Images\" width=\"500\" height=\"281\" data-src=\"https:\/\/www.youtube-nocookie.com\/embed\/uXyt_E2_myA?feature=oembed&rel=0&enablejsapi=1\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" referrerpolicy=\"strict-origin-when-cross-origin\" allowfullscreen><\/iframe><div class=\"yt-consent-placeholder__overlay\"><button class=\"yt-consent-placeholder__play\"><svg width=\"42\" height=\"42\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" aria-hidden=\"true\" focusable=\"false\"><g fill=\"none\" fill-rule=\"evenodd\"><circle fill=\"#000\" opacity=\".556\" cx=\"21\" cy=\"21\" r=\"21\"\/><path stroke=\"#FFF\" d=\"M27.5 22l-12 8.5v-17z\"\/><\/g><\/svg><span class=\"yt-consent-placeholder__label\">Video playback requires cookie consent<\/span><\/button><\/div><\/div>\n<\/div><\/figure>\n\n\n\n<div 
class=\"annotations \" data-bi-aN=\"margin-callout\">\n\t<article class=\"annotations__list card depth-16 bg-body p-4 annotations__list--right\">\n\t\t<div class=\"annotations__list-item\">\n\t\t\t\t\t\t\t<a href=\"https:\/\/msrchat.azurewebsites.net\/?askmsr=Summarize%20the%20main%20three%20points%20of%20Sameer%27s%20talk\" target=\"_blank\" aria-label=\"Summarize the main three points of Sameer's talk\" data-bi-type=\"annotated-link\" data-bi-cN=\"Summarize the main three points of Sameer's talk\" class=\"annotations__list-thumbnail\" >\n\t\t\t\t\t<img loading=\"lazy\" decoding=\"async\" width=\"172\" height=\"96\" src=\"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-240x135.png\" class=\"mb-2\" alt=\"Ask Microsoft research copilot experience\" srcset=\"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-240x135.png 240w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-300x169.png 300w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-1024x576.png 1024w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-768x432.png 768w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-1066x600.png 1066w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-655x368.png 655w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-343x193.png 343w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-640x360.png 640w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-960x540.png 960w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-1280x720.png 1280w, 
https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo.png 1400w\" sizes=\"auto, (max-width: 172px) 100vw, 172px\" \/>\t\t\t\t<\/a>\n\t\t\t\t\t\t\t<span class=\"annotations__type d-block text-uppercase font-weight-semibold text-neutral-300 small\">Microsoft research copilot experience<\/span>\n\t\t\t<a href=\"https:\/\/msrchat.azurewebsites.net\/?askmsr=Summarize%20the%20main%20three%20points%20of%20Sameer%27s%20talk\" data-bi-cN=\"Summarize the main three points of Sameer's talk\" target=\"_blank\" rel=\"noopener noreferrer\" data-external-link=\"true\" data-bi-aN=\"margin-callout\" data-bi-type=\"annotated-link\" class=\"annotations__link font-weight-semibold text-decoration-none\"><span>Summarize the main three points of Sameer's talk<\/span>&nbsp;<span class=\"glyph-in-link glyph-append glyph-append-open-in-new-tab\" aria-hidden=\"true\"><\/span><\/a>\t\t\t\t\t<\/div>\n\t<\/article>\n<\/div>\n\n\n<div class=\"wp-block-msr-show-more\">\n\t<div class=\"bg-neutral-100 p-5\">\n\t\t<div class=\"show-more-show-less\">\n\t\t\t<div>\n\t\t\t\t<span>\n\t\t\t\t\t\n\n<h3 class=\"wp-block-heading\" id=\"transcript-lightning-talk-5-kahani-visual-storytelling-through-culturally-nuanced-images\">Transcript &#8211; Lightning Talk 5: Kahani: Visual Storytelling through Culturally Nuanced Images<\/h3>\n\n\n\n<p><strong>Sameer Segal,<\/strong> Principal Research Software Development Engineer, Microsoft Research India&nbsp;<\/p>\n\n\n\n<p>Sameer Segal discusses Kahani, a research prototype that allows the user to create visually stunning and culturally nuanced images just by describing them in their local languages.&nbsp;<\/p>\n\n\n\n<p><em>Microsoft Research Forum<\/em>,<em> January 30, 2024<\/em>&nbsp;<\/p>\n\n\n\n<p><strong>S<\/strong><strong>AMEER SEGAL<\/strong><strong>:<\/strong> Hi, everyone. My name is Sameer Segal. I&#8217;m a principal research engineer at the Microsoft Research India lab. 
I&#8217;ve always been passionate about technology and societal impact. I was an entrepreneur for 10 years before I joined MSR (Microsoft Research), and it&#8217;s been absolutely wonderful the last couple of years that I&#8217;ve been here because I&#8217;m pursuing my passion at a whole new level of scale.<\/p>\n\n\n\n\t\t\t\t<\/span>\n\t\t\t\t<span id=\"show-more-show-less-toggle-1\" class=\"show-more-show-less-toggleable-content\">\n\t\t\t\t\t\n\n\n\n<p>I&#8217;m also a father to a 6-year-old daughter, and like most parents with kids this age, you spend a lot of time making up stories\u2014sometimes to teach important lessons like how to be kind and sometimes as well as just for fun. In India, we have a great repertoire of folktales, but unfortunately, they&#8217;re not visually appealing to the kids of today. With all these recent advancements in generative AI like large language models and diffusion models, wouldn&#8217;t it be great if we could create visual stories? That&#8217;s what our Project Kahani is trying to do. It&#8217;s trying to bring not only visually stunning images but also bring in cultural nuances to it.&nbsp;&nbsp;<\/p>\n\n\n\n<p>Past work has shown that diffusion models tend to stereotype and fail to understand local words, but they don&#8217;t provide ways to overcome these shortcomings without modifying the model or using fine-tuning in significant ways. The other big problem is that to get that perfect image, you need to do a lot of prompting, and sometimes if you use tools like Adobe Photoshop or use fine-tuning, this makes it out of the league of laypeople. And that&#8217;s really sad because these models were meant to be a force of democratization.&nbsp;&nbsp;<\/p>\n\n\n\n<p>Our <a href=\"https:\/\/www.microsoft.com\/en-us\/research\/project\/kahani\/\">project<\/a> started off at an internal hackathon a few months ago and has now evolved into a research project. 
Let me show you what we have built.&nbsp;&nbsp;<\/p>\n\n\n\n<p>I\u2019m going to paste a prompt inspired by a story that my daughter and I recently read. It\u2019s about Geetha, a girl who lives near a forest near BR Hills. And it\u2019s about her unexpected friendship with a butterfly and a firefly. And we want to emphasize about how important it is to be kind to your friends. So the system takes this instruction, and it tries to pick up the cultural nuances from this and generate a story. And then from there, it creates characters and scenes. And here is, you know, an example of what is done, right, so about Geetha, who meets a butterfly that&#8217;s stuck in a cobweb. If I&#8217;d like to add more to the story, I can make changes and just add new instructions. But if I&#8217;d like to add specific instructions, let&#8217;s say, on this particular slide &#8230; you know, in villages in India, we have something called as a Nazar Battu, which wards off evil. So what I can do is I can pull up the scene and just make a little hole here to place the object that I want, and I am going to give the system a reference image. I am going to tell it that this is a Nazar Battu. And let&#8217;s see what it does with this. [PAUSE] There you have it. It was pretty easy to get a word that the model doesn&#8217;t really understand right where we wanted in the context of our story.&nbsp;&nbsp;<\/p>\n\n\n\n<p>Let me show you how this was happening. From my prompt, we were able to extract these cultural elements. The large language models are especially good where they were able to understand BR Hills means that this is a place in India, southern India. And from these cultural nuances, we were able to create a specific prompt that was able to generate this character. And from this character, we were able to compose and create various scenes, right. Now it&#8217;s not perfect, but it&#8217;s a big step up from where we were with just the models. 
This work required us to do a series of benchmarking exercises where we tried out different prompts with names, visual descriptions, and definitions, and we would generate the image and compare that to a reference image that we got from a search engine. And GPT-4 [with] Vision was used as a judge to decide whether the image actually matched the reference image or not.&nbsp;<\/p>\n\n\n\n<p>We believe our work has tremendous potential. It can make local culture a lot more accessible, especially for image generation. And this can have application not just in storytelling and education but across domains.<\/p>\n\n\n\n<p>Thank you.<\/p>\n\n\t\t\t\t<\/span>\n\t\t\t<\/div>\n\t\t\t<button\n\t\t\t\tclass=\"action-trigger glyph-prepend mt-2 mb-0 show-more-show-less-toggle\"\n\t\t\t\taria-expanded=\"false\"\n\t\t\t\tdata-show-less-text=\"Show less\"\n\t\t\t\ttype=\"button\"\n\t\t\t\taria-controls=\"show-more-show-less-toggle-1\"\n\t\t\t\taria-label=\"Show more content\"\n\t\t\t\tdata-alternate-aria-label=\"Show less content\">\n\t\t\t\tShow more\t\t\t<\/button>\n\t\t<\/div>\n\t<\/div>\n<\/div>\n\n\n\n<h3 class=\"wp-block-heading alignwide\" id=\"related-resources\">Related resources<\/h3>\n\n\n\n<div class=\"wp-block-columns alignwide are-vertically-aligned-top is-layout-flex wp-container-core-columns-is-layout-9d6595d7 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-vertically-aligned-top is-layout-flow wp-block-column-is-layout-flow\">\n<div class=\"annotations \" data-bi-aN=\"citation\">\n\t<article class=\"annotations__list card depth-16 bg-body p-4 \">\n\t\t<div class=\"annotations__list-item\">\n\t\t\t\t\t\t<span class=\"annotations__type d-block text-uppercase font-weight-semibold text-neutral-300 small\">Project<\/span>\n\t\t\t<a href=\"https:\/\/www.microsoft.com\/en-us\/research\/project\/kahani\/\" data-bi-cN=\"Kahani\" data-external-link=\"false\" data-bi-aN=\"citation\" data-bi-type=\"annotated-link\" class=\"annotations__link 
font-weight-semibold text-decoration-none\"><span>Kahani<\/span>&nbsp;<span class=\"glyph-in-link glyph-append glyph-append-chevron-right\" aria-hidden=\"true\"><\/span><\/a>\t\t\t\t\t<\/div>\n\t<\/article>\n<\/div>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-vertically-aligned-top is-layout-flow wp-block-column-is-layout-flow\">\n<div class=\"annotations \" data-bi-aN=\"citation\">\n\t<article class=\"annotations__list card depth-16 bg-body p-4 \">\n\t\t<div class=\"annotations__list-item\">\n\t\t\t\t\t\t<span class=\"annotations__type d-block text-uppercase font-weight-semibold text-neutral-300 small\">Research Lab<\/span>\n\t\t\t<a href=\"https:\/\/www.microsoft.com\/en-us\/research\/lab\/microsoft-research-india\/\" data-bi-cN=\"Microsoft Research Lab \u2013 India\" data-external-link=\"false\" data-bi-aN=\"citation\" data-bi-type=\"annotated-link\" class=\"annotations__link font-weight-semibold text-decoration-none\"><span>Microsoft Research Lab \u2013 India<\/span>&nbsp;<span class=\"glyph-in-link glyph-append glyph-append-chevron-right\" aria-hidden=\"true\"><\/span><\/a>\t\t\t\t\t<\/div>\n\t<\/article>\n<\/div>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-vertically-aligned-top is-layout-flow wp-block-column-is-layout-flow\"><\/div>\n<\/div>\n","protected":false},"excerpt":{"rendered":"<p>Sameer Segal, Principal Research Software Development Engineer at Microsoft Research India, discusses Kahani, a research prototype that allows the user to create visually stunning and culturally nuanced images just by describing them in their local 
languages.<\/p>\n","protected":false},"author":42735,"featured_media":1002813,"template":"","meta":{"msr-url-field":"","msr-podcast-episode":"","msrModifiedDate":"","msrModifiedDateEnabled":false,"ep_exclude_from_search":false,"_classifai_error":"","msr-content-parent":999513,"msr_hide_image_in_river":0,"footnotes":""},"research-area":[],"msr-locale":[268875],"msr-post-option":[],"class_list":["post-999750","msr-blog-post","type-msr-blog-post","status-publish","has-post-thumbnail","hentry","msr-locale-en_us"],"msr_assoc_parent":{"id":999513,"type":"story"},"_links":{"self":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-blog-post\/999750","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-blog-post"}],"about":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/types\/msr-blog-post"}],"author":[{"embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/users\/42735"}],"version-history":[{"count":11,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-blog-post\/999750\/revisions"}],"predecessor-version":[{"id":1045098,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-blog-post\/999750\/revisions\/1045098"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/media\/1002813"}],"wp:attachment":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/media?parent=999750"}],"wp:term":[{"taxonomy":"msr-research-area","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/research-area?post=999750"},{"taxonomy":"msr-locale","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-locale?post=999750"},{"taxonomy":"msr-post-option","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-post-option?p
ost=999750"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}