% Kim and Snyder (2013): conference paper on unlabeled data set selection,
% applied to grapheme-to-phoneme conversion.
%
% NOTE(review): the booktitle pairs "North American Association for
%   Computational Linguistics" with the acronym "ACL". Presumably the venue
%   is the NAACL conference; confirm against the publisher's record and
%   replace with the official proceedings title. TODO confirm.
% NOTE(review): the url host looks unusual for this publication page; verify
%   it is the intended link, and prefer a doi field if one exists.
% The percent sign in the abstract is escaped so that styles which print the
% abstract do not treat the rest of the line as a TeX comment.
@inproceedings{kim2013optimal,
  author    = {Kim, Young-Bum and Snyder, Benjamin},
  title     = {Optimal Data Set Selection: An Application to Grapheme-to-Phoneme Conversion},
  booktitle = {North American Association for Computational Linguistics (ACL)},
  publisher = {Association for Computational Linguistics},
  year      = {2013},
  month     = jun,
  url       = {http://approjects.co.za/?big=en-us/research/publication/optimal-data-set-selection-an-application-to-grapheme-to-phoneme-conversion/},
  abstract  = {In this paper we introduce the task of unlabeled, optimal, data set selection. Given a large pool of unlabeled examples, our goal is to select a small subset to label, which will yield a high performance supervised model over the entire data set. Our first proposed method, based on the rank-revealing QR matrix factorization, selects a subset of words which span the entire word-space effectively. For our second method, we develop the concept of feature coverage which we optimize with a greedy algorithm. We apply these methods to the task of grapheme-to-phoneme prediction. Experiments over a data-set of 8 languages show that in all scenarios, our selection methods are effective at yielding a small, but optimal set of labelled examples. When fed into a state-of-the-art supervised model for grapheme-to-phoneme prediction, our methods yield average error reductions of 20\% over randomly selected examples.},
}