Skip to content

Commit 4d9d121

Browse files
committed
Python API for PCA and PCAModel
1 parent ddec173 commit 4d9d121

File tree

2 files changed

+53
-0
lines changed

2 files changed

+53
-0
lines changed

mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,6 +518,16 @@ private[python] class PythonMLLibAPI extends Serializable {
518518
new ChiSqSelector(numTopFeatures).fit(data.rdd)
519519
}
520520

521+
/**
 * Java stub for PCA.fit(). Rather than the content of the fitted Java
 * object, this stub hands back a handle to that object.
 * The Python code has to take extra care that the handle gets freed on
 * exit; see the Py4J documentation.
 */
def fitPCA(k: Int, data: JavaRDD[Vector]): PCAModel = {
  val pca = new PCA(k)
  pca.fit(data.rdd)
}
530+
521531
/**
522532
* Java stub for IDF.fit(). This stub returns a
523533
* handle to the Java object instead of the content of the Java object.

python/pyspark/mllib/feature.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,49 @@ def fit(self, data):
254254
return ChiSqSelectorModel(jmodel)
255255

256256

257+
class PCAModel(JavaVectorTransformer):
    """
    Model produced by fitting PCA; it projects vectors to a
    low-dimensional space using PCA.
    """

    def transform(self, vector):
        """
        Apply the transformation to a vector.

        :param vector: Vector or RDD of Vector to be transformed.
        :return: transformed vector.
        """
        # Delegates unchanged to the base transformer implementation.
        projected = JavaVectorTransformer.transform(self, vector)
        return projected
269+
270+
271+
class PCA(object):
    """
    A feature transformer that projects vectors to a low-dimensional space using PCA.

    >>> data = [Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),
    ...     Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),
    ...     Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0])]
    >>> model = PCA(2).fit(sc.parallelize(data))
    >>> pcArray = model.transform(Vectors.sparse(5, [(1, 1.0), (3, 7.0)])).toArray()
    >>> pcArray[0]
    1.648...
    >>> pcArray[1]
    -4.013...
    """

    def __init__(self, k):
        """
        :param k: number of principal components; converted with ``int()``
                  and required to be at least 1.
        :raises ValueError: if ``k`` is smaller than 1 after conversion.
        """
        self.k = int(k)
        # A component count below 1 is meaningless; reject it at construction
        # time, before any call to fit().
        if self.k < 1:
            raise ValueError(
                "PCA requires a number of principal components k >= 1 "
                "but was given %d" % self.k)

    def fit(self, data):
        """
        Computes a PCAModel that contains the principal components of the input vectors.

        :param data: source vectors
        :return: PCAModel wrapping the object returned by the ``fitPCA`` MLlib call.
        """
        jmodel = callMLlibFunc("fitPCA", self.k, data)
        return PCAModel(jmodel)
298+
299+
257300
class HashingTF(object):
258301
"""
259302
.. note:: Experimental

0 commit comments

Comments
 (0)