@inproceedings{arasu2006efficient, author = {Arasu, Arvind and Ganti, Venky and Kaushik, Raghav}, title = {Efficient Exact Set-Similarity Joins}, booktitle = {Proceedings of the 32nd International Conference on Very Large Data Bases, VLDB 2006}, year = {2006}, month = {August}, abstract = {Given two input collections of sets, a set-similarity join (SSJoin) identifies all pairs of sets, one from each collection, that have high similarity. Recent work has identified SSJoin as a useful primitive operator in data cleaning. In this paper, we propose new algorithms for SSJoin. Our algorithms have two important features: They are exact, i.e., they always produce the correct answer, and they carry precise performance guarantees. We believe our algorithms are the first to have both features; previous algorithms with performance guarantees are only probabilistically approximate. We demonstrate the effectiveness of our algorithms using a thorough experimental evaluation over real-life and synthetic data sets.}, publisher = {Very Large Data Bases Endowment Inc.}, url = {http://approjects.co.za/?big=en-us/research/publication/efficient-exact-set-similarity-joins/}, edition = {Proceedings of the 32nd International Conference on Very Large Data Bases, VLDB 2006}, }