@inproceedings{li2013b-bit,
  author    = {Li, Ping and Shrivastava, Anshumali and K{\"o}nig, Arnd Christian},
  title     = {b-Bit Minwise Hashing in Practice},
  booktitle = {Proceedings of the 5th Asia-Pacific Symposium on Internetware},
  year      = {2013},
  month     = {October},
  publisher = {ACM},
  url       = {http://approjects.co.za/?big=en-us/research/publication/b-bit-minwise-hashing-in-practice/},
  abstract  = {Minwise hashing is a standard technique in the context of search for approximating set similarities. The recent work [26, 32] demonstrated a potential use of b-bit minwise hashing [23, 24] for efficient search and learning on massive, high-dimensional, binary data (which are typical for many applications in Web search and text mining). In this paper, we focus on a number of critical issues which must be addressed before one can apply b-bit minwise hashing to the volumes of data often used in industrial applications. Minwise hashing requires an expensive preprocessing step that computes k (e.g., 500) minimal values after applying the corresponding permutations for each data vector. We developed a parallelization scheme using GPUs and observed that the preprocessing time can be reduced by a factor of 20 - 80 and becomes substantially smaller than the data loading time. Reducing the preprocessing time is highly beneficial in practice, e.g., for duplicate Web page detection (where minwise hashing is a major step in the crawling pipeline) or for increasing the testing speed of online classifiers. Another critical issue is that for very large data sets it becomes impossible to store a (fully) random permutation matrix, due to its space requirements. Our paper is the first study to demonstrate that b-bit minwise hashing implemented using simple hash functions, e.g., the 2-universal (2U) and 4-universal (4U) hash families, can produce learning results very similar to those obtained with fully random permutations. Experiments on datasets of up to 200GB are presented.},
}
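
The abstract describes producing k minwise hash values per binary vector and keeping only the lowest b bits of each, with cheap 2-universal (2U) hash functions standing in for fully random permutations. The following Python sketch illustrates that idea only; it is not the authors' code, and the dimension D, prime P, and the naive match-rate estimate at the end are illustrative assumptions (the paper's unbiased resemblance estimator corrects for accidental b-bit collisions).

# Illustrative sketch: b-bit minwise hashing of a binary vector, given as a
# set of nonzero feature indices, using a 2-universal hash family
# h(x) = ((a*x + c) mod P) mod D in place of fully random permutations.
import random

P = (1 << 61) - 1  # a prime larger than the feature dimension D (assumed)

def make_2u_hashes(k, seed=0):
    """Draw k independent 2-universal hash functions (a, c) with a != 0."""
    rng = random.Random(seed)
    return [(rng.randrange(1, P), rng.randrange(0, P)) for _ in range(k)]

def bbit_minhash(nonzero_indices, hashes, D, b):
    """Return k minhash values, each truncated to its lowest b bits."""
    mask = (1 << b) - 1
    sketch = []
    for a, c in hashes:
        m = min(((a * i + c) % P) % D for i in nonzero_indices)
        sketch.append(m & mask)
    return sketch

if __name__ == "__main__":
    D, k, b = 1 << 20, 500, 2          # e.g., k = 500 hashes, b = 2 bits each
    hashes = make_2u_hashes(k, seed=42)
    s1 = {1, 5, 7, 100, 4096, 9999}
    s2 = {1, 5, 7, 100, 4096, 12345}
    h1 = bbit_minhash(s1, hashes, D, b)
    h2 = bbit_minhash(s2, hashes, D, b)
    # Raw fraction of matching b-bit values; this overestimates the resemblance
    # because unrelated minima can agree in their lowest b bits.
    matched = sum(x == y for x, y in zip(h1, h2)) / k
    print("raw b-bit match rate:", matched)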