2 changes: 1 addition & 1 deletion dev/sparktestsupport/modules.py
@@ -379,7 +379,6 @@ def __hash__(self):
"pyspark.mllib.evaluation",
"pyspark.mllib.feature",
"pyspark.mllib.fpm",
"pyspark.mllib.linalg.__init__",
"pyspark.mllib.linalg.distributed",
"pyspark.mllib.random",
"pyspark.mllib.recommendation",
@@ -406,6 +405,7 @@ def __hash__(self):
"pyspark.ml.feature",
"pyspark.ml.classification",
"pyspark.ml.clustering",
"pyspark.ml.linalg.__init__",
"pyspark.ml.recommendation",
"pyspark.ml.regression",
"pyspark.ml.tuning",
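Note: this moves the `pyspark.ml.linalg.__init__` doctest module from the mllib test list to the ml test list. As a quick sketch of what that module exercises (hedged; assumes a local PySpark build is importable), the new package provides the same vector factories as the old one:

>>> from pyspark.ml.linalg import Vectors
>>> Vectors.dense([1.0, 2.0, 3.0])
DenseVector([1.0, 2.0, 3.0])
>>> Vectors.sparse(3, [0, 2], [1.0, 3.0])
SparseVector(3, {0: 1.0, 2: 3.0})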
mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -1207,15 +1207,18 @@ private[python] class PythonMLLibAPI extends Serializable {
private[spark] object SerDe extends Serializable {

val PYSPARK_PACKAGE = "pyspark.mllib"
val PYSPARK_ML_PACKAGE = "pyspark.ml"

/**
* Base class used for pickle
*/
private[python] abstract class BasePickler[T: ClassTag]
extends IObjectPickler with IObjectConstructor {

protected def packageName: String = PYSPARK_PACKAGE

private val cls = implicitly[ClassTag[T]].runtimeClass
private val module = PYSPARK_PACKAGE + "." + cls.getName.split('.')(4)
private val module = packageName + "." + cls.getName.split('.')(4)
private val name = cls.getSimpleName

// register this to Pickler and Unpickler
@@ -1265,6 +1268,8 @@ private[spark] object SerDe extends Serializable {
// Pickler for DenseVector
private[python] class DenseVectorPickler extends BasePickler[DenseVector] {

override protected def packageName = PYSPARK_ML_PACKAGE

def saveState(obj: Object, out: OutputStream, pickler: Pickler): Unit = {
val vector: DenseVector = obj.asInstanceOf[DenseVector]
val bytes = new Array[Byte](8 * vector.size)
@@ -1297,6 +1302,8 @@ private[spark] object SerDe extends Serializable {
// Pickler for DenseMatrix
private[python] class DenseMatrixPickler extends BasePickler[DenseMatrix] {

override protected def packageName = PYSPARK_ML_PACKAGE

def saveState(obj: Object, out: OutputStream, pickler: Pickler): Unit = {
val m: DenseMatrix = obj.asInstanceOf[DenseMatrix]
val bytes = new Array[Byte](8 * m.values.length)
@@ -1334,6 +1341,8 @@ private[spark] object SerDe extends Serializable {
// Pickler for SparseMatrix
private[python] class SparseMatrixPickler extends BasePickler[SparseMatrix] {

override protected def packageName = PYSPARK_ML_PACKAGE

def saveState(obj: Object, out: OutputStream, pickler: Pickler): Unit = {
val s = obj.asInstanceOf[SparseMatrix]
val order = ByteOrder.nativeOrder()
@@ -1389,6 +1398,8 @@ private[spark] object SerDe extends Serializable {
// Pickler for SparseVector
private[python] class SparseVectorPickler extends BasePickler[SparseVector] {

override protected def packageName = PYSPARK_ML_PACKAGE

def saveState(obj: Object, out: OutputStream, pickler: Pickler): Unit = {
val v: SparseVector = obj.asInstanceOf[SparseVector]
val n = v.indices.length
@@ -1431,6 +1442,8 @@ private[spark] object SerDe extends Serializable {
// Pickler for LabeledPoint
private[python] class LabeledPointPickler extends BasePickler[LabeledPoint] {

override protected def packageName = PYSPARK_PACKAGE

def saveState(obj: Object, out: OutputStream, pickler: Pickler): Unit = {
val point: LabeledPoint = obj.asInstanceOf[LabeledPoint]
saveObjects(out, pickler, point.label, point.features)
@@ -1447,6 +1460,8 @@
// Pickler for Rating
private[python] class RatingPickler extends BasePickler[Rating] {

override protected def packageName = PYSPARK_PACKAGE

def saveState(obj: Object, out: OutputStream, pickler: Pickler): Unit = {
val rating: Rating = obj.asInstanceOf[Rating]
saveObjects(out, pickler, rating.user, rating.product, rating.rating)
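Note: the `packageName` override above determines which Python module name each pickler writes and registers: the vector and matrix picklers now resolve to `pyspark.ml.linalg`, while `LabeledPoint` and `Rating` stay under `pyspark.mllib`. A hedged sketch of the intended end-to-end effect (assumes a running `sqlContext` as in the doctests below, and that `PCAModel` exposes the `pc` principal-components matrix as in the Scala API); with this change the matrix fetched from the JVM through SerDe should come back as a `pyspark.ml.linalg` type:

>>> from pyspark.ml.linalg import Vectors, DenseMatrix
>>> from pyspark.ml.feature import PCA
>>> df = sqlContext.createDataFrame(
...     [(Vectors.dense([1.0, 0.0, 3.0]),), (Vectors.dense([2.0, 5.0, 1.0]),)], ["features"])
>>> model = PCA(k=2, inputCol="features", outputCol="pca_features").fit(df)
>>> isinstance(model.pc, DenseMatrix)  # deserialized via the SerDe picklers above
True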
14 changes: 7 additions & 7 deletions python/pyspark/ml/classification.py
@@ -53,7 +53,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
Currently, this class only supports binary classification.

>>> from pyspark.sql import Row
>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> df = sc.parallelize([
... Row(label=1.0, weight=2.0, features=Vectors.dense(1.0)),
... Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], []))]).toDF()
@@ -496,7 +496,7 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
It supports both binary and multiclass labels, as well as both continuous and categorical
features.

>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> from pyspark.ml.feature import StringIndexer
>>> df = sqlContext.createDataFrame([
... (1.0, Vectors.dense(1.0)),
@@ -625,7 +625,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred

>>> import numpy
>>> from numpy import allclose
>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> from pyspark.ml.feature import StringIndexer
>>> df = sqlContext.createDataFrame([
... (1.0, Vectors.dense(1.0)),
@@ -752,7 +752,7 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
`SPARK-4240 <https://issues.apache.org/jira/browse/SPARK-4240>`_

>>> from numpy import allclose
>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> from pyspark.ml.feature import StringIndexer
>>> df = sqlContext.createDataFrame([
... (1.0, Vectors.dense(1.0)),
@@ -884,7 +884,7 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H
The input feature values must be nonnegative.

>>> from pyspark.sql import Row
>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> df = sqlContext.createDataFrame([
... Row(label=0.0, features=Vectors.dense([0.0, 0.0])),
... Row(label=0.0, features=Vectors.dense([0.0, 1.0])),
@@ -1023,7 +1023,7 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
Number of inputs has to be equal to the size of feature vectors.
Number of outputs has to be equal to the total number of labels.

>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> df = sqlContext.createDataFrame([
... (0.0, Vectors.dense([0.0, 0.0])),
... (1.0, Vectors.dense([0.0, 1.0])),
@@ -1188,7 +1188,7 @@ class OneVsRest(Estimator, OneVsRestParams, MLReadable, MLWritable):
is picked to label the example.

>>> from pyspark.sql import Row
>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> df = sc.parallelize([
... Row(label=0.0, features=Vectors.dense(1.0, 0.8)),
... Row(label=1.0, features=Vectors.sparse(2, [], [])),
8 changes: 4 additions & 4 deletions python/pyspark/ml/clustering.py
@@ -65,7 +65,7 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte

GaussianMixture clustering.

>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors

>>> data = [(Vectors.dense([-0.1, -0.05 ]),),
... (Vectors.dense([-0.01, -0.1]),),
@@ -194,7 +194,7 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
K-means clustering with a k-means++ like initialization mode
(the k-means|| algorithm by Bahmani et al).

>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
... (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
>>> df = sqlContext.createDataFrame(data, ["features"])
@@ -347,7 +347,7 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
If bisecting all divisible clusters on the bottom level would result in more than `k` leaf
clusters, larger clusters get higher priority.

>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
... (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
>>> df = sqlContext.createDataFrame(data, ["features"])
@@ -625,7 +625,7 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInter
:py:class:`pyspark.ml.feature.Tokenizer` and :py:class:`pyspark.ml.feature.CountVectorizer`
can be useful for converting text to word count vectors.

>>> from pyspark.mllib.linalg import Vectors, SparseVector
>>> from pyspark.ml.linalg import Vectors, SparseVector
>>> from pyspark.ml.clustering import LDA
>>> df = sqlContext.createDataFrame([[1, Vectors.dense([0.0, 1.0])],
... [2, SparseVector(2, {0: 1.0})],], ["id", "features"])
2 changes: 1 addition & 1 deletion python/pyspark/ml/evaluation.py
@@ -111,7 +111,7 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction
The rawPrediction column can be of type double (binary 0/1 prediction, or probability of label
1) or of type vector (length-2 vector of raw predictions, scores, or label probabilities).

>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> scoreAndLabels = map(lambda x: (Vectors.dense([1.0 - x[0], x[0]]), x[1]),
... [(0.1, 0.0), (0.1, 1.0), (0.4, 0.0), (0.6, 0.0), (0.6, 1.0), (0.6, 1.0), (0.8, 1.0)])
>>> dataset = sqlContext.createDataFrame(scoreAndLabels, ["raw", "label"])
26 changes: 13 additions & 13 deletions python/pyspark/ml/feature.py
@@ -23,11 +23,11 @@

from pyspark import since, keyword_only
from pyspark.rdd import ignore_unicode_prefix
from pyspark.ml.linalg import _convert_to_vector
from pyspark.ml.param.shared import *
from pyspark.ml.util import JavaMLReadable, JavaMLWritable
from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer, _jvm
from pyspark.mllib.common import inherit_doc
from pyspark.mllib.linalg import _convert_to_vector

__all__ = ['Binarizer',
'Bucketizer',
@@ -380,7 +380,7 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWrit
.. seealso:: `More information on Wikipedia \
<https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II>`_.

>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> df1 = sqlContext.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])
>>> dct = DCT(inverse=False, inputCol="vec", outputCol="resultVec")
>>> df2 = dct.transform(df1)
@@ -447,7 +447,7 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReada
with a provided "weight" vector. In other words, it scales each column of the dataset
by a scalar multiplier.

>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> df = sqlContext.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], ["values"])
>>> ep = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),
... inputCol="values", outputCol="eprod")
@@ -582,7 +582,7 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab

Compute the Inverse Document Frequency (IDF) given a collection of documents.

>>> from pyspark.mllib.linalg import DenseVector
>>> from pyspark.ml.linalg import DenseVector
>>> df = sqlContext.createDataFrame([(DenseVector([1.0, 2.0]),),
... (DenseVector([0.0, 1.0]),), (DenseVector([3.0, 0.2]),)], ["tf"])
>>> idf = IDF(minDocFreq=3, inputCol="tf", outputCol="idf")
@@ -670,7 +670,7 @@ class MaxAbsScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Jav
absolute value in each feature. It does not shift/center the data, and thus does not destroy
any sparsity.

>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> df = sqlContext.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"])
>>> maScaler = MaxAbsScaler(inputCol="a", outputCol="scaled")
>>> model = maScaler.fit(df)
@@ -757,7 +757,7 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Jav
Note that since zero values will probably be transformed to non-zero values, output of the
transformer will be DenseVector even for sparse input.

>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
>>> mmScaler = MinMaxScaler(inputCol="a", outputCol="scaled")
>>> model = mmScaler.fit(df)
@@ -961,7 +961,7 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav

Normalize a vector to have unit norm using the given p-norm.

>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> svec = Vectors.sparse(4, {1: 4.0, 3: 3.0})
>>> df = sqlContext.createDataFrame([(Vectors.dense([3.0, -4.0]), svec)], ["dense", "sparse"])
>>> normalizer = Normalizer(p=2.0, inputCol="dense", outputCol="features")
@@ -1114,7 +1114,7 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLRead
multiplication distributes over addition". Take a 2-variable feature vector as an example:
`(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`.

>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> df = sqlContext.createDataFrame([(Vectors.dense([0.5, 2.0]),)], ["dense"])
>>> px = PolynomialExpansion(degree=2, inputCol="dense", outputCol="expanded")
>>> px.transform(df).head().expanded
@@ -1459,7 +1459,7 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, J
Standardizes features by removing the mean and scaling to unit variance using column summary
statistics on the samples in the training set.

>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
>>> standardScaler = StandardScaler(inputCol="a", outputCol="scaled")
>>> model = standardScaler.fit(df)
@@ -1942,7 +1942,7 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Ja
- Add warning if a categorical feature has only 1 category.
- Add option for allowing unknown categories.

>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> df = sqlContext.createDataFrame([(Vectors.dense([-1.0, 0.0]),),
... (Vectors.dense([0.0, 1.0]),), (Vectors.dense([0.0, 2.0]),)], ["a"])
>>> indexer = VectorIndexer(maxCategories=2, inputCol="a", outputCol="indexed")
@@ -2062,7 +2062,7 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, J
The output vector will order features with the selected indices first (in the order given),
followed by the selected names (in the order given).

>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> df = sqlContext.createDataFrame([
... (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),
... (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),
@@ -2329,7 +2329,7 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab

PCA trains a model to project vectors to a low-dimensional space using PCA.

>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
... (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
... (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
@@ -2547,7 +2547,7 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja
Chi-Squared feature selection, which selects categorical features to use for predicting a
categorical label.

>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.linalg import Vectors
>>> df = sqlContext.createDataFrame(
... [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
... (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),