From f385367c9d0baa8263ecea89c31deecd304ab44f Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 17 May 2016 10:32:03 -0700 Subject: [PATCH] remove ml.LabeledPoint from PySpark and annotate ml.LabeledPoint in Python --- .../spark/ml/feature/LabeledPoint.scala | 5 ++- .../mllib/api/python/PythonMLLibAPI.scala | 19 ----------- python/pyspark/ml/feature.py | 33 +------------------ python/pyspark/ml/tests.py | 4 +-- 4 files changed, 7 insertions(+), 54 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LabeledPoint.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LabeledPoint.scala index 6cb515b784f5c..f7f1d42039599 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/LabeledPoint.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LabeledPoint.scala @@ -19,6 +19,7 @@ package org.apache.spark.ml.feature import scala.beans.BeanInfo +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.Vector /** @@ -27,8 +28,10 @@ import org.apache.spark.ml.linalg.Vector * @param label Label for this data point. * @param features List of features for this data point. */ +@Since("2.0.0") +@Experimental @BeanInfo -case class LabeledPoint(label: Double, features: Vector) { +case class LabeledPoint(@Since("2.0.0") label: Double, @Since("2.0.0") features: Vector) { override def toString: String = { s"($label,$features)" } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 7c379be62a5cd..90d382753131d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -1623,24 +1623,6 @@ private[spark] object SerDe extends Serializable { } } - // Pickler for ML LabeledPoint - private[python] class MLLabeledPointPickler extends BasePickler[MLLabeledPoint] { - - override protected def packageName = PYSPARK_ML_PACKAGE - - def saveState(obj: Object, out: OutputStream, pickler: Pickler): Unit = { - val point: MLLabeledPoint = obj.asInstanceOf[MLLabeledPoint] - saveObjects(out, pickler, point.label, point.features) - } - - def construct(args: Array[Object]): Object = { - if (args.length != 2) { - throw new PickleException("should be 2") - } - new MLLabeledPoint(args(0).asInstanceOf[Double], args(1).asInstanceOf[NewVector]) - } - } - // Pickler for Rating private[python] class RatingPickler extends BasePickler[Rating] { @@ -1684,7 +1666,6 @@ private[spark] object SerDe extends Serializable { new NewSparseMatrixPickler().register() new NewSparseVectorPickler().register() new LabeledPointPickler().register() - new MLLabeledPointPickler().register() new RatingPickler().register() initialized = true } diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 493a163ae2cf4..983b6a5301ae1 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -29,8 +29,7 @@ from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer, _jvm from pyspark.mllib.common import inherit_doc -__all__ = ['LabeledPoint', - 'Binarizer', +__all__ = ['Binarizer', 'Bucketizer', 'ChiSqSelector', 'ChiSqSelectorModel', 'CountVectorizer', 'CountVectorizerModel', @@ -60,36 +59,6 @@ 'Word2Vec', 'Word2VecModel'] -class LabeledPoint(object): - - """ - Class that represents the features and labels of a data point. - - :param label: - Label for this data point. - :param features: - Vector of features for this point (NumPy array, list, - pyspark.ml.linalg.SparseVector, or scipy.sparse column matrix). - - Note: 'label' and 'features' are accessible as class attributes. - - .. versionadded:: 1.0.0 - """ - - def __init__(self, label, features): - self.label = float(label) - self.features = _convert_to_vector(features) - - def __reduce__(self): - return (LabeledPoint, (self.label, self.features)) - - def __str__(self): - return "(" + ",".join((str(self.label), str(self.features))) + ")" - - def __repr__(self): - return "LabeledPoint(%s, %s)" % (self.label, self.features) - - @inherit_doc class Binarizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 1a5f1081aef6f..e3511120bdecb 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -1409,8 +1409,8 @@ def test_serialization(self): self.assertEqual(v, self.udt.deserialize(self.udt.serialize(v))) def test_infer_schema(self): - rdd = self.sc.parallelize([LabeledPoint(1.0, self.dv1), - LabeledPoint(0.0, self.sv1)]) + rdd = self.sc.parallelize([Row(label=1.0, features=self.dv1), + Row(label=0.0, features=self.sv1)]) df = rdd.toDF() schema = df.schema field = [f for f in schema.fields if f.name == "features"][0]