@inproceedings{pado2008evaluating,
  author    = {Pado, Sebastian and Galley, Michel and Jurafsky, Dan and Manning, Chris},
  title     = {Evaluating {MT} output with entailment technology},
  booktitle = {AMTA Workshop: Metrics MATR - Metrics for Machine Translation Challenge},
  year      = {2008},
  month     = {August},
  abstract  = {Constant evaluation is vital to the progress of machine translation. However, human evaluation is costly, time-consuming, and difficult to do reliably. On the other hand, automatic measures of machine translation performance (such as BLEU, NIST, TER, and METEOR), while cheap and objective, have increasingly come under suspicion as to whether they are satisfactory measuring instruments. Recent work (e.g., Callison-Burch et al. (2006)) has demonstrated that for current state-of-the-art MT systems, the correlation between BLEU scores and human adequacy and fluency ratings is often low; BLEU scores tend to favor statistical over rule-based systems; and BLEU-like measures tend to perform worse at the segment level than at the corpus level.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/evaluating-mt-output-entailment-technology/},
}