From 184a3cd5a2ac879a7a95009c4ef67eb3cc979d44 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Tue, 18 Aug 2015 10:08:55 +0800 Subject: [PATCH 1/6] add vector slicer doc, with java/python suite --- docs/ml-features.md | 138 ++++++++++++++++++ .../ml/feature/JavaVectorSlicerSuite.java | 85 +++++++++++ python/pyspark/ml/feature.py | 73 +++++++++ 3 files changed, 296 insertions(+) create mode 100644 mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java diff --git a/docs/ml-features.md b/docs/ml-features.md index cec2cbe673407..3725ce3ef90ea 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1389,3 +1389,141 @@ print(output.select("features", "clicked").first()) # Feature Selectors +## VectorSlicer + +`VectorSlicer` is a transformer that takes a feature vector and outputs a new feature vector with a sub-array of the original features. It is useful for extracting features from a vector column. + +`VectorSlicer` accepts a vector column with a specified indices, then outputs a new vector column whose values are selected via those indices. There are two types of indices, + + 1. Integer indices that represents the real indices in the vector, `setIndices()`; + + 2. String indices that represents the names of features in the vector, `setNames()`. + +Specify by integer and string are both acceptable, moreover, you can use integer index and string name simultaneously. At least one feature must be selected. Duplicate features are not allowed, so there can be no overlap between selected indices and names. Note that if names of features are selected, an exception will be threw out when encountering with empty input attributes. + +The output vector will order features with the selected indices first (in the order given), followed by the selected names (in the order given). + +**Examples** + +Suppose that we have a DataFrame with the column `userFeatures`: + +~~~ + userFeatures +------------------ + [0.0, 10.0, 0.5] +~~~ + +`userFeatures` is a vector column that contains three user features. Assuming that the first column of `userFeatures` are all zeros, so we want to remove it and only the last two columns are selected. The `VectorSlicer` selects the last two elements with `setIndices(1, 2)` then produces a new vector column named `features`: + +~~~ + userFeatures | features +------------------|----------------------------- + [0.0, 10.0, 0.5] | [10.0, 0.5] +~~~ + +Suppose also that we have a potential input attributes for the `userFeatures`, i.e. `["f1", "f2", "f3"]`, then we can use `setNames("f2", "f3")` to select them. + +~~~ + userFeatures | features +------------------|----------------------------- + [0.0, 10.0, 0.5] | [10.0, 0.5] + ["f1", "f2", "f3"] | ["f2", "f3"] +~~~ + +
+
+
+[`VectorSlicer`](api/scala/index.html#org.apache.spark.ml.feature.VectorSlicer) takes an input column name with specified indices or names and an output column name.
+
+{% highlight scala %}
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
+import org.apache.spark.ml.feature.VectorSlicer
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.{DataFrame, Row, SQLContext}
+
+val data = Array(
+  Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
+  Vectors.dense(-2.0, 2.3, 0.0)
+)
+
+val defaultAttr = NumericAttribute.defaultAttr
+val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
+val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]])
+
+val dataRDD = sc.parallelize(data).map(Row.apply)
+val dataset = sqlContext.createDataFrame(dataRDD, StructType(Array(attrGroup.toStructField())))
+
+val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features")
+
+slicer.setIndices(Array(1)).setNames(Array("f3"))
+// or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))
+
+val output = slicer.transform(dataset)
+println(output.select("userFeatures", "features").first())
+{% endhighlight %}
+ +
+
+[`VectorSlicer`](api/java/org/apache/spark/ml/feature/VectorSlicer.html) takes an input column name with specified indices or names and an output column name.
+
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.*;
+import static org.apache.spark.sql.types.DataTypes.*;
+
+import org.apache.spark.ml.attribute.Attribute;
+import org.apache.spark.ml.attribute.AttributeGroup;
+import org.apache.spark.ml.attribute.NumericAttribute;
+import org.apache.spark.ml.feature.VectorSlicer;
+
+Attribute[] attrs = new Attribute[]{
+  NumericAttribute.defaultAttr().withName("f1"),
+  NumericAttribute.defaultAttr().withName("f2"),
+  NumericAttribute.defaultAttr().withName("f3")
+};
+AttributeGroup group = new AttributeGroup("userFeatures", attrs);
+
+JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+  RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})),
+  RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0))
+));
+
+DataFrame dataset = jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField()));
+
+VectorSlicer vectorSlicer = new VectorSlicer()
+  .setInputCol("userFeatures").setOutputCol("features");
+
+vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"});
+// or vectorSlicer.setIndices(new int[]{1, 2}), or vectorSlicer.setNames(new String[]{"f2", "f3"})
+
+DataFrame output = vectorSlicer.transform(dataset);
+
+System.out.println(output.select("userFeatures", "features").first());
+{% endhighlight %}
+ +
+ +[`VectorSlicer`](api/python/pyspark.ml.html#pyspark.ml.feature.VectorSlicer) takes an input column name with specified indices or names and an output column name. + +{% highlight python %} +from pyspark.mllib.linalg import Vectors +from pyspark.ml.feature import VectorAssembler + +dataset = sqlContext.createDataFrame( + [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)], + ["id", "hour", "mobile", "userFeatures", "clicked"]) +assembler = VectorAssembler( + inputCols=["hour", "mobile", "userFeatures"], + outputCol="features") +output = assembler.transform(dataset) +print(output.select("features", "clicked").first()) +{% endhighlight %} +
+
+ + diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java new file mode 100644 index 0000000000000..56988b9fb29cb --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature; + +import com.google.common.collect.Lists; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.ml.attribute.Attribute; +import org.apache.spark.ml.attribute.AttributeGroup; +import org.apache.spark.ml.attribute.NumericAttribute; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.types.StructType; + + +public class JavaVectorSlicerSuite { + private transient JavaSparkContext jsc; + private transient SQLContext jsql; + + @Before + public void setUp() { + jsc = new JavaSparkContext("local", "JavaVectorSlicerSuite"); + jsql = new SQLContext(jsc); + } + + @After + public void tearDown() { + jsc.stop(); + jsc = null; + } + + @Test + public void vectorSlice() { + Attribute[] attrs = new Attribute[]{ + NumericAttribute.defaultAttr().withName("f1"), + NumericAttribute.defaultAttr().withName("f2"), + NumericAttribute.defaultAttr().withName("f3") + }; + AttributeGroup group = new AttributeGroup("userFeatures", attrs); + + JavaRDD jrdd = jsc.parallelize(Lists.newArrayList( + RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})), + RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0)) + )); + + DataFrame dataset = jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField())); + + VectorSlicer vectorSlicer = new VectorSlicer() + .setInputCol("userFeatures").setOutputCol("features"); + + vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"}); + + DataFrame output = vectorSlicer.transform(dataset); + + for (Row r : output.select("userFeatures", "features").take(2)) { + Vector features = r.getAs(1); + Assert.assertEquals(features.size(), 2); + } + } +} diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 535d55326646c..09828090fd2eb 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -33,6 +33,79 @@ 'PCAModel', 'RFormula', 'RFormulaModel'] +@inherit_doc +class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol): + """ + Slice a vector column given 
indices or names. + + >>> from pyspark.mllib.linalg import DenseVector + >>> from pyspark.mllib.linalg import SparseVector + >>> df = sqlContext.createDataFrame([(SparseVector(3, {0: -2.0, 1: 2.3}),), + ... (DenseVector([-2.0, 2.3, 0.0]),)], ["userFeatures"]) + >>> vectorSlicer = VectorSlicer(indices=[1, 2], inputCol="userFeatures", outputCol="features") + >>> vectorSlicer.transform(df).head().features + ??? + """ + + # a placeholder to make it appear in the generated doc + indices = Param(Params._dummy(), "indices", + "An array of indices to select features from a vector column. There can be no overlap with names.") + names = Param(Params._dummy(), "names", + "An array of feature names to select features from a vector column. There can be no overlap with indices.") + + + @keyword_only + def __init__(self, indices=[], names=[], inputCol=None, outputCol=None): + """ + __init__(self, indices=[], names=[], inputCol=None, outputCol=None) + """ + super(VectorSlicer, self).__init__() + self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid) + self.indices = Param(self, "indices", + "An array of indices to select features from a vector column. There can be no overlap with names.") + self._setDefault(indices=[]) + self.names = Param(self, "names", + "An array of feature names to select features from a vector column. There can be no overlap with indices.") + self._setDefault(names=[]) + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + + @keyword_only + def setParams(self, indices=[], names=[], inputCol=None, outputCol=None): + """ + setParams(self, indices=[], names=[], inputCol=None, outputCol=None) + Sets params for this VectorSlicer. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def setIndices(self, value): + """ + Sets the value of :py:attr:`indices`. + """ + self._paramMap[self.indices] = value + return self + + def getIndices(self): + """ + Gets the value of indices or its default value. + """ + return self.getOrDefault(self.indices) + + def setNames(self, value): + """ + Sets the value of :py:attr:`names`. + """ + self._paramMap[self.names] = value + return self + + def getNames(self): + """ + Gets the value of names or its default value. + """ + return self.getOrDefault(self.names) + @inherit_doc class Binarizer(JavaTransformer, HasInputCol, HasOutputCol): """ From 61092cc579032da7974b3358b7197212175d1437 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Tue, 18 Aug 2015 15:12:53 +0800 Subject: [PATCH 2/6] add python version --- docs/ml-features.md | 26 +++++++++++++++----------- python/pyspark/ml/feature.py | 14 +++++++++----- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index 3725ce3ef90ea..8d36ec106571f 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1430,6 +1430,10 @@ Suppose also that we have a potential input attributes for the `userFeatures`, i ["f1", "f2", "f3"] | ["f2", "f3"] ~~~ +**NOTE** + +`VectorSlicer` of Python version does not supprt selecting by names currently. +
@@ -1472,7 +1476,6 @@ println(output.select("userFeatures", "features").first()) import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.linalg.VectorUDT; import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.Row; @@ -1511,17 +1514,18 @@ System.out.println(output.select("userFeatures", "features").first()); [`VectorSlicer`](api/python/pyspark.ml.html#pyspark.ml.feature.VectorSlicer) takes an input column name with specified indices or names and an output column name. {% highlight python %} -from pyspark.mllib.linalg import Vectors -from pyspark.ml.feature import VectorAssembler +from pyspark.mllib.linalg import DenseVector +from pyspark.mllib.linalg import SparseVector +from pyspark.ml.feature import VectorSlicer -dataset = sqlContext.createDataFrame( - [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)], - ["id", "hour", "mobile", "userFeatures", "clicked"]) -assembler = VectorAssembler( - inputCols=["hour", "mobile", "userFeatures"], - outputCol="features") -output = assembler.transform(dataset) -print(output.select("features", "clicked").first()) +dataset = sqlContext.createDataFrame([(SparseVector(3, {0: -2.0, 1: 2.3}),), + (DenseVector([-2.0, 2.3, 0.0]),)], ["userFeatures"]) + +vectorSlicer = VectorSlicer(indices=[1, 2], inputCol="userFeatures", outputCol="features") + +output = vectorSlicer.transform(dataset) + +print(output.select("userFeatures", "features").first()) {% endhighlight %}
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 09828090fd2eb..9d4e653c15a5e 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -44,14 +44,16 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol): ... (DenseVector([-2.0, 2.3, 0.0]),)], ["userFeatures"]) >>> vectorSlicer = VectorSlicer(indices=[1, 2], inputCol="userFeatures", outputCol="features") >>> vectorSlicer.transform(df).head().features - ??? + SparseVector(2, {0: 2.3}) """ # a placeholder to make it appear in the generated doc indices = Param(Params._dummy(), "indices", - "An array of indices to select features from a vector column. There can be no overlap with names.") + "An array of indices to select features from a vector column." + + " There can be no overlap with names.") names = Param(Params._dummy(), "names", - "An array of feature names to select features from a vector column. There can be no overlap with indices.") + "An array of feature names to select features from a vector column." + + " There can be no overlap with indices.") @keyword_only @@ -62,10 +64,12 @@ def __init__(self, indices=[], names=[], inputCol=None, outputCol=None): super(VectorSlicer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid) self.indices = Param(self, "indices", - "An array of indices to select features from a vector column. There can be no overlap with names.") + "An array of indices to select features from a vector column." + + " There can be no overlap with names.") self._setDefault(indices=[]) self.names = Param(self, "names", - "An array of feature names to select features from a vector column. There can be no overlap with indices.") + "An array of feature names to select features from a vector column." + + " There can be no overlap with indices.") self._setDefault(names=[]) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) From 3ff4b5deb70e2676f1f645a7b358b0646db30c0c Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Tue, 18 Aug 2015 16:07:57 +0800 Subject: [PATCH 3/6] fix python style error --- python/pyspark/ml/feature.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 9d4e653c15a5e..50d471da282cc 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -30,7 +30,7 @@ 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', 'PCA', - 'PCAModel', 'RFormula', 'RFormulaModel'] + 'PCAModel', 'RFormula', 'RFormulaModel', 'VectorSlicer'] @inherit_doc @@ -110,6 +110,7 @@ def getNames(self): """ return self.getOrDefault(self.names) + @inherit_doc class Binarizer(JavaTransformer, HasInputCol, HasOutputCol): """ From d5cdabd445c070c2ab492f3b58a3d3505f724a61 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Tue, 18 Aug 2015 16:53:54 +0800 Subject: [PATCH 4/6] remove extra blank lines --- python/pyspark/ml/feature.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 50d471da282cc..b7f6d7e0663ad 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -55,7 +55,6 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol): "An array of feature names to select features from a vector column." 
+
                  " There can be no overlap with indices.")
 
-
     @keyword_only
     def __init__(self, indices=[], names=[], inputCol=None, outputCol=None):
         """
@@ -74,7 +73,6 @@ def __init__(self, indices=[], names=[], inputCol=None, outputCol=None):
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
-
     @keyword_only
     def setParams(self, indices=[], names=[], inputCol=None, outputCol=None):
         """

From e85e60f912b370b7ed1af7f0f3b653c64c880358 Mon Sep 17 00:00:00 2001
From: Xusen Yin
Date: Wed, 19 Aug 2015 19:32:34 +0800
Subject: [PATCH 5/6] fix style errors

---
 docs/ml-features.md | 39 +++++++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 12 deletions(-)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 1a704c8ed21d0..e55513c3c06e7 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1477,17 +1477,25 @@ print(output.select("features", "clicked").first())
 
 ## VectorSlicer
 
-`VectorSlicer` is a transformer that takes a feature vector and outputs a new feature vector with a sub-array of the original features. It is useful for extracting features from a vector column.
+`VectorSlicer` is a transformer that takes a feature vector and outputs a new feature vector with a
+sub-array of the original features. It is useful for extracting features from a vector column.
 
-`VectorSlicer` accepts a vector column with a specified indices, then outputs a new vector column whose values are selected via those indices. There are two types of indices,
+`VectorSlicer` accepts a vector column with specified indices, then outputs a new vector column
+whose values are selected via those indices. There are two types of indices:
 
- 1. Integer indices that represents the real indices in the vector, `setIndices()`;
+ 1. Integer indices that represent the indices into the vector, `setIndices()`;
 
- 2. String indices that represents the names of features in the vector, `setNames()`.
+ 2. String indices that represent the names of features in the vector, `setNames()`.
+ *This requires the vector column to have an `AttributeGroup` since the implementation matches on
+ the name field of an `Attribute`.*
 
-Specify by integer and string are both acceptable, moreover, you can use integer index and string name simultaneously. At least one feature must be selected. Duplicate features are not allowed, so there can be no overlap between selected indices and names. Note that if names of features are selected, an exception will be threw out when encountering with empty input attributes.
+Specification by integer and string are both acceptable. Moreover, you can use integer indices and
+string names simultaneously. At least one feature must be selected. Duplicate features are not
+allowed, so there can be no overlap between selected indices and names. Note that if names of
+features are selected, an exception will be thrown if the input attributes are empty.
 
-The output vector will order features with the selected indices first (in the order given), followed by the selected names (in the order given).
+The output vector will order features with the selected indices first (in the order given),
+followed by the selected names (in the order given).
 
 **Examples**
 
@@ -1499,7 +1507,10 @@ Suppose that we have a DataFrame with the column `userFeatures`:
 ~~~
  [0.0, 10.0, 0.5]
 ~~~
 
-`userFeatures` is a vector column that contains three user features. Assuming that the first column of `userFeatures` are all zeros, so we want to remove it and only the last two columns are selected. The `VectorSlicer` selects the last two elements with `setIndices(1, 2)` then produces a new vector column named `features`:
+`userFeatures` is a vector column that contains three user features. Suppose that the first column
+of `userFeatures` contains only zeros, so we want to remove it and select only the last two columns.
+
The `VectorSlicer` selects the last two elements with `setIndices(1, 2)` then produces a new vector column named `features`: +`userFeatures` is a vector column that contains three user features. Assuming that the first column +of `userFeatures` are all zeros, so we want to remove it and only the last two columns are selected. +The `VectorSlicer` selects the last two elements with `setIndices(1, 2)` then produces a new vector +column named `features`: ~~~ userFeatures | features @@ -1507,7 +1518,8 @@ Suppose that we have a DataFrame with the column `userFeatures`: [0.0, 10.0, 0.5] | [10.0, 0.5] ~~~ -Suppose also that we have a potential input attributes for the `userFeatures`, i.e. `["f1", "f2", "f3"]`, then we can use `setNames("f2", "f3")` to select them. +Suppose also that we have a potential input attributes for the `userFeatures`, i.e. +`["f1", "f2", "f3"]`, then we can use `setNames("f2", "f3")` to select them. ~~~ userFeatures | features @@ -1518,12 +1530,13 @@ Suppose also that we have a potential input attributes for the `userFeatures`, i **NOTE** -`VectorSlicer` of Python version does not supprt selecting by names currently. +The Python version of `VectorSlicer` does not support selecting by names currently.
-[`VectorSlicer`](api/scala/index.html#org.apache.spark.ml.feature.VectorSlicer) takes an input column name with specified indices or names and an output column name. +[`VectorSlicer`](api/scala/index.html#org.apache.spark.ml.feature.VectorSlicer) takes an input +column name with specified indices or names and an output column name. {% highlight scala %} import org.apache.spark.mllib.linalg.Vectors @@ -1556,7 +1569,8 @@ println(output.select("userFeatures", "features").first())
-[`VectorSlicer`](api/java/org/apache/spark/ml/feature/VectorSlicer.html) takes an input column name with specified indices or names and an output column name. +[`VectorSlicer`](api/java/org/apache/spark/ml/feature/VectorSlicer.html) takes an input column name +with specified indices or names and an output column name. {% highlight java %} import java.util.Arrays; @@ -1597,7 +1611,8 @@ System.out.println(output.select("userFeatures", "features").first());
-[`VectorSlicer`](api/python/pyspark.ml.html#pyspark.ml.feature.VectorSlicer) takes an input column name with specified indices or names and an output column name. +[`VectorSlicer`](api/python/pyspark.ml.html#pyspark.ml.feature.VectorSlicer) takes an input column +name with specified indices or names and an output column name. {% highlight python %} from pyspark.mllib.linalg import DenseVector From f8f0d61dd4985581971b499321efe34972a72f1b Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Fri, 21 Aug 2015 10:55:31 +0800 Subject: [PATCH 6/6] remove python API --- docs/ml-features.md | 25 ------------ python/pyspark/ml/feature.py | 78 +----------------------------------- 2 files changed, 1 insertion(+), 102 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index 2a0f21dcd4023..642a4b4c53183 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1532,10 +1532,6 @@ Suppose also that we have a potential input attributes for the `userFeatures`, i ["f1", "f2", "f3"] | ["f2", "f3"] ~~~ -**NOTE** - -The Python version of `VectorSlicer` does not support selecting by names currently. -
@@ -1612,27 +1608,6 @@ DataFrame output = vectorSlicer.transform(dataset); System.out.println(output.select("userFeatures", "features").first()); {% endhighlight %}
- -
- -[`VectorSlicer`](api/python/pyspark.ml.html#pyspark.ml.feature.VectorSlicer) takes an input column -name with specified indices or names and an output column name. - -{% highlight python %} -from pyspark.mllib.linalg import DenseVector -from pyspark.mllib.linalg import SparseVector -from pyspark.ml.feature import VectorSlicer - -dataset = sqlContext.createDataFrame([(SparseVector(3, {0: -2.0, 1: 2.3}),), - (DenseVector([-2.0, 2.3, 0.0]),)], ["userFeatures"]) - -vectorSlicer = VectorSlicer(indices=[1, 2], inputCol="userFeatures", outputCol="features") - -output = vectorSlicer.transform(dataset) - -print(output.select("userFeatures", "features").first()) -{% endhighlight %} -
## RFormula diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 40de69eff5c69..04b2b2ccc9e55 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -30,83 +30,7 @@ 'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', - 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel', 'VectorSlicer'] - - -@inherit_doc -class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol): - """ - Slice a vector column given indices or names. - - >>> from pyspark.mllib.linalg import DenseVector - >>> from pyspark.mllib.linalg import SparseVector - >>> df = sqlContext.createDataFrame([(SparseVector(3, {0: -2.0, 1: 2.3}),), - ... (DenseVector([-2.0, 2.3, 0.0]),)], ["userFeatures"]) - >>> vectorSlicer = VectorSlicer(indices=[1, 2], inputCol="userFeatures", outputCol="features") - >>> vectorSlicer.transform(df).head().features - SparseVector(2, {0: 2.3}) - """ - - # a placeholder to make it appear in the generated doc - indices = Param(Params._dummy(), "indices", - "An array of indices to select features from a vector column." + - " There can be no overlap with names.") - names = Param(Params._dummy(), "names", - "An array of feature names to select features from a vector column." + - " There can be no overlap with indices.") - - @keyword_only - def __init__(self, indices=[], names=[], inputCol=None, outputCol=None): - """ - __init__(self, indices=[], names=[], inputCol=None, outputCol=None) - """ - super(VectorSlicer, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid) - self.indices = Param(self, "indices", - "An array of indices to select features from a vector column." + - " There can be no overlap with names.") - self._setDefault(indices=[]) - self.names = Param(self, "names", - "An array of feature names to select features from a vector column." + - " There can be no overlap with indices.") - self._setDefault(names=[]) - kwargs = self.__init__._input_kwargs - self.setParams(**kwargs) - - @keyword_only - def setParams(self, indices=[], names=[], inputCol=None, outputCol=None): - """ - setParams(self, indices=[], names=[], inputCol=None, outputCol=None) - Sets params for this VectorSlicer. - """ - kwargs = self.setParams._input_kwargs - return self._set(**kwargs) - - def setIndices(self, value): - """ - Sets the value of :py:attr:`indices`. - """ - self._paramMap[self.indices] = value - return self - - def getIndices(self): - """ - Gets the value of indices or its default value. - """ - return self.getOrDefault(self.indices) - - def setNames(self, value): - """ - Sets the value of :py:attr:`names`. - """ - self._paramMap[self.names] = value - return self - - def getNames(self): - """ - Gets the value of names or its default value. - """ - return self.getOrDefault(self.names) + 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel'] @inherit_doc