% Microsoft technical report MSR-TR-2014-165 on randomized PCA implemented on
% Hadoop and REEF (see the abstract field for the summary).
% NOTE(review): the url host below is approjects.co.za, not a Microsoft domain;
% it looks like a mirror/proxy rewrite of the publisher's page. Confirm and
% replace with the canonical publication URL (and add an access date) if so.
@techreport{kumar2014distributed,
  author      = {Kumar, Arun and Narayanan, Vijay and Karampatziakis, Nikos and Mineiro, Paul and Weimer, Markus},
  title       = {Distributed and Scalable {PCA} in the Cloud},
  institution = {Microsoft},
  number      = {MSR-TR-2014-165},
  year        = {2014},
  month       = jan,
  url         = {http://approjects.co.za/?big=en-us/research/publication/distributed-and-scalable-pca-in-the-cloud/},
  note        = {Apache Reef Research Paper},
  abstract    = {Principal Component Analysis (PCA) is a popular technique with many applications. Recent randomized PCA algorithms scale to large datasets but face a bottleneck when the number of features is also large. We propose to mitigate this issue using a composition of structured and unstructured randomness within a randomized PCA algorithm. Initial experiments using a large graph dataset from Twitter show promising results. We demonstrate the scalability of our algorithm by implementing it both on Hadoop, and a more flexible platform named REEF.},
}