@inproceedings{kobren2017an, author = {Kobren, Ari and Krishnamurthy, Akshay and Monath, Nicholas and McCallum, Andrew}, title = {An online hierarchical algorithm for extreme clustering}, booktitle = {Knowledge Discovery and Data Mining}, year = {2017}, month = {August}, abstract = {Many modern clustering methods scale well to a large number of data items, N, but not to a large number of clusters, K. This paper introduces PERCH, a new non-greedy algorithm for online hierarchical clustering that scales to both massive N and K--a problem setting we term extreme clustering. Our algorithm efficiently routes new data points to the leaves of an incrementally-built tree. Motivated by the desire for both accuracy and speed, our approach performs tree rotations for the sake of enhancing subtree purity and encouraging balancedness. We prove that, under a natural separability assumption, our non-greedy algorithm will produce trees with perfect dendrogram purity regardless of online data arrival order. Our experiments demonstrate that PERCH constructs more accurate trees than other tree-building clustering algorithms and scales well with both N and K, achieving a higher quality clustering than the strongest flat clustering competitor in nearly half the time.}, url = {http://approjects.co.za/?big=en-us/research/publication/an-online-hierarchical-algorithm-for-extreme-clustering/}, }