% Workshop paper (Big Learning workshop at NIPS 2011).
% NOTE(review): the url host below does not look like the publisher's own
% site -- presumably a rewritten copy of the original landing-page link;
% verify and replace with the canonical URL (or a DOI) if one exists.
@inproceedings{li2011b-bit,
  author    = {Li, Ping and Shrivastava, Anshumali and Moore, Joshua and K{\"o}nig, Arnd Christian},
  title     = {b-Bit Minwise Hashing for Large-Scale Learning},
  booktitle = {Big Learning 2011: {NIPS} 2011 Workshop on Algorithms, Systems, and Tools for Learning at Scale},
  publisher = {Neural Information Processing Foundation},
  year      = {2011},
  month     = dec,
  url       = {http://approjects.co.za/?big=en-us/research/publication/b-bit-minwise-hashing-for-large-scale-learning/},
  abstract  = {Minwise hashing is a standard technique in the context of search for efficiently computing set similarities. The recent development of b-bit minwise hashing provides a substantial improvement by storing only the lowest b bits of each hashed value. In this paper, we demonstrate that b-bit minwise hashing can be naturally integrated with linear learning algorithms such as linear SVM and logistic regression, to solve large-scale and high-dimensional statistical learning tasks, especially when the data do not fit in memory. We compare b-bit minwise hashing with the Count-Min (CM) and Vowpal Wabbit (VW) algorithms, which have essentially the same variances as random projections. Our theoretical and empirical comparisons illustrate that b-bit minwise hashing is significantly more accurate (at the same storage cost) than VW (and random projections) for binary data.},
}