% Hyrax, OSDI 2023 conference paper. The url field points at the publisher-hosted publication page.
@inproceedings{lyu2023hyrax,
  author       = {Lyu, Jialun and You, Marisa and Irvene, Celine and Jung, Mark and Narmore, Tyler and Shapiro, Jacob and Marshall, Luke and Samal, Savyasachi and Manousakis, Ioannis and Hsu, Lisa and Subbarayalu, Preetha and Raniwala, Ashish and Warrier, Brijesh and Bianchini, Ricardo and Schroeder, Bianca and Berger, Daniel S.},
  title        = {{Hyrax}: Fail-in-Place Server Operation in Cloud Platforms},
  booktitle    = {Proceedings of the 17th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI})},
  organization = {USENIX},
  year         = {2023},
  month        = jul,
  abstract     = {Today's cloud platforms handle server hardware failures by shutting down the affected server and only turning it back online once it has been repaired by a technician. At cloud scale, this all-or-nothing operating model is becoming increasingly unsustainable. This model is also at odds with technology trends, such as the need for new cooling technology. This paper introduces Hyrax, a datacenter stack that enables compute servers with failed components to continue hosting VMs while hiding the underlying degraded capacity and performance. A key enabler of Hyrax is a novel model of changes in memory interleaving when deactivating faulty memory modules. Experiments on cloud production servers show that Hyrax overcomes common hardware failures without impacting peak VM performance. In large-scale simulations with production traces, Hyrax reduces server repair requirements by 50--60\% without impacting VM scheduling.},
  url          = {https://www.microsoft.com/en-us/research/publication/hyrax-fail-in-place-server-operation-in-cloud-platforms/},
}