@misc{wei2026learning,
  author       = {Wei, Hangxing and Chen, Xiaoyu and Zhang, Chuheng and Pearce, Tim and Chen, Jianyu and Lamb, Alex and Zhao, Li and Bian, Jiang},
  title        = {Learning Additively Compositional Latent Actions for Embodied {AI}},
  howpublished = {arXiv},
  year         = {2026},
  month        = apr,
  abstract     = {Latent action learning infers pseudo-action labels from visual transitions, providing an approach to leverage internet-scale video for embodied AI. However, most methods learn latent actions without structural priors that encode the additive, compositional structure of physical motion. As a result, latents often entangle irrelevant scene details or information about future observations with true state changes and miscalibrate motion magnitude. We introduce Additively Compositional Latent Action Model (AC-LAM), which enforces scene-wise additive composition structure over short horizons on the latent action space. These AC constraints encourage simple algebraic structure in the latent action space~(identity, inverse, cycle consistency) and suppress information that does not compose additively. Empirically, AC-LAM learns more structured, motion-specific, and displacement-calibrated latent actions and provides stronger supervision for downstream policy learning, outperforming state-of-the-art LAMs across simulated and real-world tabletop tasks.},
  url          = {http://approjects.co.za/?big=en-us/research/publication/learning-additively-compositional-latent-actions-for-embodied-ai/},
}