% Dolan and Brockett (2005): workshop paper describing the creation of the
% Microsoft Research Paraphrase Corpus.
% Conventions used in this entry:
%   - month is the predefined macro (unquoted) so the bibliography style
%     decides how it is rendered.
%   - edition is intentionally absent: that field holds an ordinal such as
%     "Second", not the event name, which lives in booktitle.
%   - The acronym in booktitle is brace-protected against style recasing.
%   - NOTE(review): the url host looks like a mirror/proxy rather than the
%     publisher's own site -- TODO confirm and replace with the canonical
%     link (and add an access date) once verified.
%   - NOTE(review): the abstract's "publicly available" and "topic-clustered"
%     were restored from run-together words in the export; hyphenation is a
%     best guess -- TODO confirm against the paper.
@inproceedings{dolan2005automatically,
  author    = {Dolan, Bill and Brockett, Chris},
  title     = {Automatically Constructing a Corpus of Sentential Paraphrases},
  booktitle = {Third International Workshop on Paraphrasing ({IWP2005})},
  publisher = {Asia Federation of Natural Language Processing},
  year      = {2005},
  month     = jan,
  url       = {http://approjects.co.za/?big=en-us/research/publication/automatically-constructing-a-corpus-of-sentential-paraphrases/},
  abstract  = {An obstacle to research in automatic paraphrase identification and generation is the lack of large-scale, publicly available labeled corpora of sentential paraphrases. This paper describes the creation of the recently-released Microsoft Research Paraphrase Corpus, which contains 5801 sentence pairs, each hand-labeled with a binary judgment as to whether the pair constitutes a paraphrase. The corpus was created using heuristic extraction techniques in conjunction with an SVM-based classifier to select likely sentence-level paraphrases from a large corpus of topic-clustered news data. These pairs were then submitted to human judges, who confirmed that 67\% were in fact semantically equivalent. In addition to describing the corpus itself, we explore a number of issues that arose in defining guidelines for the human raters.},
}