@inproceedings{axelrod2011domain,
  author    = {Axelrod, Amittai and He, Xiaodong and Gao, Jianfeng},
  title     = {Domain Adaptation via Pseudo In-Domain Data Selection},
  booktitle = {Proceedings of the 2011 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  year      = {2011},
  month     = {July},
  publisher = {Association for Computational Linguistics},
  url       = {https://www.microsoft.com/en-us/research/publication/domain-adaptation-via-pseudo-in-domain-data-selection/},
  abstract  = {We explore efficient domain adaptation for the task of statistical machine translation based on extracting sentences from a large general-domain parallel corpus that are most relevant to the target domain. These sentences may be selected with simple cross-entropy based methods, of which we present three. As these sentences are not themselves identical to the in-domain data, we call them pseudo in-domain subcorpora. These subcorpora, at 1% the size of the original, can then be used to train small domain-adapted Statistical Machine Translation (SMT) systems which outperform systems trained on the entire corpus. Performance is further improved when we use these domain-adapted models in combination with a true in-domain model. The results show that more training data is not always better, and that the best results are attained via proper domain-relevant data selection, as well as by combining in-domain and general-domain systems during decoding.},
}
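
The scoring rule underlying this paper's data selection is the Moore-Lewis cross-entropy difference, which the paper extends to the bilingual setting. The following is a minimal, self-contained Python sketch of the monolingual variant; it assumes a toy add-one-smoothed unigram language model in place of the n-gram language models used in the paper, and the example corpora and threshold are invented for illustration, not taken from the paper.

import math
from collections import Counter

def train_unigram_lm(sentences):
    # Toy stand-in for the paper's n-gram LMs: add-one-smoothed unigrams.
    counts = Counter(tok for s in sentences for tok in s.split())
    total = sum(counts.values())
    vocab = len(counts) + 1  # +1 reserves mass for unseen tokens
    def logprob(token):
        return math.log((counts[token] + 1) / (total + vocab))
    return logprob

def cross_entropy(sentence, logprob):
    # Per-token negative log-likelihood of the sentence under the LM.
    toks = sentence.split()
    return -sum(logprob(t) for t in toks) / max(len(toks), 1)

def cross_entropy_difference(sentence, in_lm, gen_lm):
    # Moore-Lewis score H_in(s) - H_gen(s): lower means the sentence
    # looks more like the in-domain data and less like the general corpus.
    return cross_entropy(sentence, in_lm) - cross_entropy(sentence, gen_lm)

# Hypothetical corpora for illustration only.
in_domain = ["the patient was given aspirin", "the dosage was increased"]
general = ["stocks fell sharply today", "the patient recovered",
           "parliament passed the bill"]

in_lm = train_unigram_lm(in_domain)
gen_lm = train_unigram_lm(general)

# Rank the general corpus and keep the lowest-scoring fraction as the
# pseudo in-domain subcorpus (the paper keeps subcorpora on the order
# of 1% of the original corpus).
ranked = sorted(general, key=lambda s: cross_entropy_difference(s, in_lm, gen_lm))
print(ranked[:1])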