% Preprint entry for "Language Modeling with Hyperspherical Flows".
% Math in the abstract is written as LaTeX ($\ell_2$, $\mathbb{S}$, $\mathbb{S}^{d-1}$);
% \mathbb requires amssymb/amsfonts if the abstract is ever typeset.
% NOTE(review): the url host looks like a proxy-rewritten address rather than the
% publisher's own domain -- confirm the canonical link and add eprint/urldate if known.
@misc{deschenaux2026language,
  author       = {Deschenaux, Justin and Gulcehre, Caglar},
  title        = {Language Modeling with Hyperspherical Flows},
  howpublished = {arXiv},
  year         = {2026},
  month        = may,
  abstract     = {Discrete Diffusion Language Models progressed rapidly as an alternative to autoregressive (AR) models, motivated by their parallel generation abilities. However, for tractability, discrete diffusion models sample from a factorized distribution, which is less expressive than AR. Recent Flow Language Models (FLMs) apply continuous flows to language, transporting noise to data with a deterministic ODE that avoids factorized sampling. FLMs operate on one-hot vectors whose dimension scales with the vocabulary size, making FLMs costly to train. Moreover, since all distinct one-hot embeddings are equidistant in $\ell_2$, adding Gaussian noise does not have a clear semantic interpretation (unlike images, where Gaussian noise progressively degrades structure). We introduce $\mathbb{S}$-FLM, a latent FLM in the hypersphere. $\mathbb{S}$-FLM generates sequences by rotating vectors in $\mathbb{S}^{d-1}$ along a velocity field learned with cross-entropy, avoiding the overhead of materializing one-hot vectors. Previous FLMs match AR in Generative Perplexity (Gen. PPL), but samples with high likelihood are not necessarily correct in verifiable domains such as math and code. $\mathbb{S}$-FLM substantially improves continuous flow language models on large-vocabulary reasoning and closes the gap to masked diffusion under standard-temperature sampling ($T=1$), while a gap remains under optimized low-temperature ($T=0.1$) decoding.},
  url          = {http://approjects.co.za/?big=en-us/research/publication/language-modeling-with-hyperspherical-flows/},
}