Skip to content

Commit 32e3cda

Browse files
yanboliang authored and jkbradley committed
[SPARK-7604] [MLLIB] Python API for PCA and PCAModel
Python API for PCA and PCAModel

Author: Yanbo Liang <[email protected]>

Closes #6315 from yanboliang/spark-7604 and squashes the following commits:

1d58734 [Yanbo Liang] remove transform() in PCAModel, use default behavior
4d9d121 [Yanbo Liang] Python API for PCA and PCAModel
1 parent a1e3649 commit 32e3cda

File tree

2 files changed

+45
-0
lines changed

2 files changed

+45
-0
lines changed

mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,16 @@ private[python] class PythonMLLibAPI extends Serializable {
519519
new ChiSqSelector(numTopFeatures).fit(data.rdd)
520520
}
521521

522+
/**
 * Java-side entry point backing PCA.fit() in the Python API.
 * The caller receives a handle to the fitted Java model object, not a copy
 * of its contents. The Python code therefore has to take extra care that
 * the object is freed on exit; see the Py4J documentation.
 */
def fitPCA(k: Int, data: JavaRDD[Vector]): PCAModel = {
  // Build the estimator for k components, then fit it on the underlying Scala RDD.
  val estimator = new PCA(k)
  estimator.fit(data.rdd)
}
531+
522532
/**
523533
* Java stub for IDF.fit(). This stub returns a
524534
* handle to the Java object instead of the content of the Java object.

python/pyspark/mllib/feature.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,41 @@ def fit(self, data):
252252
return ChiSqSelectorModel(jmodel)
253253

254254

255+
class PCAModel(JavaVectorTransformer):
    """
    Model fitted by :class:`PCA` that can project vectors to a
    low-dimensional space using PCA.

    This class defines no methods of its own; all behaviour is inherited
    from ``JavaVectorTransformer``.
    """
259+
260+
261+
class PCA(object):
    """
    A feature transformer that projects vectors to a low-dimensional space using PCA.

    >>> data = [Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),
    ...         Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),
    ...         Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0])]
    >>> model = PCA(2).fit(sc.parallelize(data))
    >>> pcArray = model.transform(Vectors.sparse(5, [(1, 1.0), (3, 7.0)])).toArray()
    >>> pcArray[0]
    1.648...
    >>> pcArray[1]
    -4.013...
    """
    def __init__(self, k):
        """
        :param k: number of principal components. Any value accepted by
                  ``int()`` is converted; the result must be at least 1.
        :raises ValueError: if the converted ``k`` is less than 1.
        """
        k = int(k)
        # A non-positive number of components is never meaningful. Reject it
        # here so the mistake is reported at construction time instead of
        # being passed through to the JVM call made in fit().
        if k < 1:
            raise ValueError(
                "PCA requires a number of principal components k >= 1 "
                "but was given %d" % k)
        self.k = k

    def fit(self, data):
        """
        Computes a :class:`PCAModel` that contains the principal components
        of the input vectors.

        :param data: source vectors (an RDD of vectors, as in the class
                     example above).
        :return: a :class:`PCAModel` wrapping the object returned by the
                 ``fitPCA`` MLlib call.
        """
        jmodel = callMLlibFunc("fitPCA", self.k, data)
        return PCAModel(jmodel)
288+
289+
255290
class HashingTF(object):
256291
"""
257292
.. note:: Experimental

0 commit comments

Comments
 (0)