% NSDI 2023 paper on deploying intra-region RDMA for Azure Storage.
% Author names are stored in "Last, First" form, in the original order.
% NOTE(review): the url host below looks like a mirror/proxy artifact rather
% than the publisher's own domain; confirm the canonical link (or add a DOI)
% before relying on it. The value is kept unchanged from the source record.
@inproceedings{bai2023empowering,
  author       = {Bai, Wei and
                  Abdeen, Shanim Sainul and
                  Agrawal, Ankit and
                  Attre, Krishan Kumar and
                  Bahl, Victor and
                  Bhagat, Ameya and
                  Bhaskara, Gowri and
                  Brokhman, Tanya and
                  Cao, Lei and
                  Cheema, Ahmad and
                  Chow, Rebecca and
                  Cohen, Jeff and
                  Elhaddad, Mahmoud and
                  Ette, Vivek and
                  Figlin, Igal and
                  Firestone, Daniel and
                  George, Mathew and
                  German, Ilya and
                  Ghai, Lakhmeet and
                  Green, Eric and
                  Greenberg, Albert and
                  Gupta, Manish and
                  Haagens, Randy and
                  Hendel, Matthew and
                  Howlader, Ridwan and
                  John, Neetha and
                  Johnstone, Julia and
                  Jolly, Tom and
                  Kramer, Greg and
                  Kruse, David and
                  Kumar, Ankit and
                  Lan, Erica and
                  Lee, Ivan and
                  Levy, Avi and
                  Lipshteyn, Marina and
                  Liu, Xin and
                  Liu, Chen and
                  Lu, Guohan and
                  Lu, Yuemin and
                  Lu, Xiakun and
                  Makhervaks, Vadim and
                  Malashanka, Ulad and
                  Maltz, Dave and
                  Marinos, Ilias and
                  Mehta, Rohan and
                  Murthi, Sharda and
                  Namdhari, Anup and
                  Ogus, Aaron and
                  Padhye, Jitu and
                  Pandya, Madhav and
                  Phillips, Douglas and
                  Power, Adrian and
                  Puri, Suraj and
                  Raindel, Shachar and
                  Rhee, Jordan and
                  Russo, Anthony and
                  Sah, Maneesh and
                  Sheriff, Ali and
                  Sparacino, Chris and
                  Srivastava, Ashutosh and
                  Sun, Weixiang and
                  Swanson, Nick and
                  Tian, Fuhou and
                  Tomczyk, Lukasz and
                  Vadlamuri, Vamsi and
                  Wolman, Alec and
                  Xie, Ying and
                  Yom, Joyce and
                  Yuan, Lihua and
                  Zhang, Yanzhao and
                  Zill, Brian},
  title        = {Empowering {Azure} Storage with {RDMA}},
  booktitle    = {{USENIX} Symposium on Networked Systems Design and Implementation ({NSDI})},
  organization = {USENIX},
  year         = {2023},
  month        = apr,
  abstract     = {Given the wide adoption of disaggregated storage in public clouds, networking is the key to enabling high performance and high reliability in a cloud storage service. In Azure, we choose Remote Direct Memory Access (RDMA) as our transport and aim to enable it for both storage frontend traffic (between compute virtual machines and storage clusters) and backend traffic (within a storage cluster) to fully realize its benefits.
                  As compute and storage clusters may be located in different datacenters within an Azure region, we need to support RDMA at regional scale. This work presents our experience in deploying intra-region RDMA to support storage workloads in Azure. The high complexity and heterogeneity of our infrastructure bring a series of new challenges, such as the problem of interoperability between different types of RDMA network interface cards. We have made several changes to our network infrastructure to address these challenges. Today, around 70\% of traffic in Azure is RDMA and intra-region RDMA is supported in all Azure public regions. RDMA helps us achieve significant disk I/O performance improvements and CPU core savings.},
  url          = {http://approjects.co.za/?big=en-us/research/publication/empowering-azure-storage-with-rdma/},
}