@inproceedings{li2012gpu-based, author = {Li, Ping and Shrivastava, Anshumali and König, Arnd Christian}, title = {GPU-Based Minwise Hashing}, booktitle = {21st International World Wide Web Conference}, year = {2012}, month = {April}, abstract = {Minwise hashing is a standard technique for efficient set similarity estimation in the context of search. The recent work of b-bit minwise hashing provided a substantial improvement by storing only the lowest b bits of each hashed value. Both minwise hashing and b-bit minwise hashing require an expensive preprocessing step for applying k (e.g., k = 500) permutations on the entire data in order to compute k minimal values as the hashed data. In this paper, we developed a parallelization scheme using GPUs, which reduced the processing time by a factor of 20-80x. Reducing the preprocessing time is highly beneficial in practice, for example, for duplicate web page detection (where minwise hashing is a major step in the crawling pipeline) or for increasing the testing speed of online classifiers (when the test data are not preprocessed).}, publisher = {Association for Computing Machinery, Inc.}, url = {http://approjects.co.za/?big=en-us/research/publication/gpu-based-minwise-hashing/}, edition = {21st International World Wide Web Conference}, }