@inproceedings{axelrod2011domain,
  author    = {Axelrod, Amittai and He, Xiaodong and Gao, Jianfeng},
  title     = {Domain Adaptation via Pseudo In-Domain Data Selection},
  booktitle = {Proceedings of the 2011 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  year      = {2011},
  month     = {July},
  publisher = {Association for Computational Linguistics},
  url       = {https://www.microsoft.com/en-us/research/publication/domain-adaptation-via-pseudo-in-domain-data-selection/},
  abstract  = {We explore efficient domain adaptation for the task of statistical machine translation based on extracting sentences from a large general-domain parallel corpus that are most relevant to the target domain. These sentences may be selected with simple cross-entropy based methods, of which we present three. As these sentences are not themselves identical to the in-domain data, we call them pseudo in-domain subcorpora. These subcorpora, at 1% the size of the original, can then be used to train small domain-adapted Statistical Machine Translation (SMT) systems which outperform systems trained on the entire corpus. Performance is further improved when we use these domain-adapted models in combination with a true in-domain model. The results show that more training data is not always better, and that the best results are attained via proper domain-relevant data selection, as well as by combining in-domain and general-domain systems during decoding.},
}
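
The scoring rule underlying this paper's data selection is the Moore-Lewis cross-entropy difference, which the paper extends to the bilingual setting. The following is a minimal, self-contained Python sketch of the monolingual variant; it assumes a toy add-one-smoothed unigram language model in place of the n-gram language models used in the paper, and the example corpora and threshold are invented for illustration, not taken from the paper.

import math
from collections import Counter

def train_unigram_lm(sentences):
    # Toy stand-in for the paper's n-gram LMs: add-one-smoothed unigrams.
    counts = Counter(tok for s in sentences for tok in s.split())
    total = sum(counts.values())
    vocab = len(counts) + 1  # +1 reserves mass for unseen tokens
    def logprob(token):
        return math.log((counts[token] + 1) / (total + vocab))
    return logprob

def cross_entropy(sentence, logprob):
    # Per-token negative log-likelihood of the sentence under the LM.
    toks = sentence.split()
    return -sum(logprob(t) for t in toks) / max(len(toks), 1)

def cross_entropy_difference(sentence, in_lm, gen_lm):
    # Moore-Lewis score H_in(s) - H_gen(s): lower means the sentence
    # looks more like the in-domain data and less like the general corpus.
    return cross_entropy(sentence, in_lm) - cross_entropy(sentence, gen_lm)

# Hypothetical corpora for illustration only.
in_domain = ["the patient was given aspirin", "the dosage was increased"]
general = ["stocks fell sharply today", "the patient recovered",
           "parliament passed the bill"]

in_lm = train_unigram_lm(in_domain)
gen_lm = train_unigram_lm(general)

# Rank the general corpus and keep the lowest-scoring fraction as the
# pseudo in-domain subcorpus (the paper keeps subcorpora on the order
# of 1% of the original corpus).
ranked = sorted(general, key=lambda s: cross_entropy_difference(s, in_lm, gen_lm))
print(ranked[:1])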