@article{ch-wang2023do,
  author   = {CH-Wang, Sky and Van Durme, Ben and Eisner, Jason and Kedzie, Chris},
  title    = {Do Androids Know They're Only Dreaming of Electric Sheep?},
  year     = {2023},
  month    = {December},
  abstract = {We design probes trained on the internal representations of a transformer language model that are predictive of its hallucinatory behavior on in-context generation tasks. To facilitate this detection, we create a span-annotated dataset of organic and synthetic hallucinations over several tasks. We find that probes trained on the force-decoded states of synthetic hallucinations are generally ecologically invalid in organic hallucination detection. Furthermore, hidden state information about hallucination appears to be task and distribution-dependent. Intrinsic and extrinsic hallucination saliency varies across layers, hidden state types, and tasks; notably, extrinsic hallucinations tend to be more salient in a transformer's internal representations. Outperforming multiple contemporary baselines, we show that probing is a feasible and efficient alternative to language model hallucination evaluation when model states are available.},
  url      = {http://approjects.co.za/?big=en-us/research/publication/do-androids-know-theyre-only-dreaming-of-electric-sheep/},
  journal  = {arXiv: Computation and Language},
}