Post-hoc Attention Steering for LLMs (PASTA)<\/a>, focuses on better controllability. These approaches are designed to be resource-efficient, as they do not require tuning or back propagation. Looking ahead, our goal is to further develop these techniques to improve the resource-efficiency of LLM applications, making them more accessible to a wider audience. <\/p>\n","protected":false},"excerpt":{"rendered":"LLMs rely on memory-intensive mechanisms like the key-value (KV) cache to store and quickly retrieve data. FastGen optimizes KV cache usage, reducing LLM memory demands by up to 50% while maintaining performance.<\/p>\n","protected":false},"author":37583,"featured_media":1030227,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"msr-url-field":"","msr-podcast-episode":"","msrModifiedDate":"","msrModifiedDateEnabled":false,"ep_exclude_from_search":false,"_classifai_error":"","footnotes":""},"categories":[1],"tags":[],"research-area":[13556],"msr-region":[],"msr-event-type":[],"msr-locale":[268875],"msr-post-option":[243984],"msr-impact-theme":[],"msr-promo-type":[],"msr-podcast-series":[],"class_list":["post-1030206","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-research-blog","msr-research-area-artificial-intelligence","msr-locale-en_us","msr-post-option-blog-homepage-featured"],"msr_event_details":{"start":"","end":"","location":""},"podcast_url":"","podcast_episode":"","msr_research_lab":[],"msr_impact_theme":[],"related-publications":[],"related-downloads":[],"related-videos":[],"related-academic-programs":[],"related-groups":[],"related-projects":[],"related-events":[1014303],"related-researchers":[{"type":"user_nicename","value":"Liyuan Liu","user_id":43302,"display_name":"Liyuan Liu","author_link":"Liyuan Liu<\/a>","is_active":false,"last_first":"Liu, Liyuan","people_section":0,"alias":"lucliu"},{"type":"user_nicename","value":"Jianfeng Gao","user_id":32246,"display_name":"Jianfeng Gao","author_link":"Jianfeng Gao<\/a>","is_active":false,"last_first":"Gao, Jianfeng","people_section":0,"alias":"jfgao"}],"msr_type":"Post","featured_image_thumbnail":"","byline":"Liyuan Liu<\/a> and Jianfeng Gao<\/a>","formattedDate":"May 8, 2024","formattedExcerpt":"LLMs rely on memory-intensive mechanisms like the key-value (KV) cache to store and quickly retrieve data. FastGen optimizes KV cache usage, reducing LLM memory demands by up to 50% while maintaining performance.","locale":{"slug":"en_us","name":"English","native":"","english":"English"},"_links":{"self":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/posts\/1030206"}],"collection":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/users\/37583"}],"replies":[{"embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/comments?post=1030206"}],"version-history":[{"count":31,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/posts\/1030206\/revisions"}],"predecessor-version":[{"id":1032420,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/posts\/1030206\/revisions\/1032420"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/media\/1030227"}],"wp:attachment":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/media?parent=1030206"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/categories?post=1030206"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/tags?post=1030206"},{"taxonomy":"msr-research-area","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/research-area?post=1030206"},{"taxonomy":"msr-region","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-region?post=1030206"},{"taxonomy":"msr-event-type","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-event-type?post=1030206"},{"taxonomy":"msr-locale","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-locale?post=1030206"},{"taxonomy":"msr-post-option","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-post-option?post=1030206"},{"taxonomy":"msr-impact-theme","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-impact-theme?post=1030206"},{"taxonomy":"msr-promo-type","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-promo-type?post=1030206"},{"taxonomy":"msr-podcast-series","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-podcast-series?post=1030206"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}