diff --git a/docs/ml-features.md b/docs/ml-features.md
index 72643137d96b..f1acb3aea7e9 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -775,35 +775,43 @@ for more details on the API.
-## OneHotEncoder
+## OneHotEncoder (Deprecated since 2.3.0)
-[One-hot encoding](http://en.wikipedia.org/wiki/One-hot) maps a column of label indices to a column of binary vectors, with at most a single one-value. This encoding allows algorithms which expect continuous features, such as Logistic Regression, to use categorical features.
+Because the existing `OneHotEncoder` is a stateless transformer, it is not usable on new data where the number of categories may differ from the training data. To fix this, a new `OneHotEncoderEstimator` was created that produces a `OneHotEncoderModel` when fitted. For more details, please see [SPARK-13030](https://issues.apache.org/jira/browse/SPARK-13030).
+
+`OneHotEncoder` has been deprecated in 2.3.0 and will be removed in 3.0.0. Please use [OneHotEncoderEstimator](ml-features.html#onehotencoderestimator) instead.
+
+## OneHotEncoderEstimator
+
+[One-hot encoding](http://en.wikipedia.org/wiki/One-hot) maps a categorical feature, represented as a label index, to a binary vector with at most a single one-value indicating the presence of a specific feature value from among the set of all feature values. This encoding allows algorithms which expect continuous features, such as Logistic Regression, to use categorical features. For string type input data, it is common to encode categorical features using [StringIndexer](ml-features.html#stringindexer) first.
+
+`OneHotEncoderEstimator` can transform multiple columns, returning a one-hot-encoded output vector column for each input column. It is common to merge these vectors into a single feature vector using [VectorAssembler](ml-features.html#vectorassembler).
+
+`OneHotEncoderEstimator` supports the `handleInvalid` parameter to choose how to handle invalid input while transforming data. Available options include 'keep' (any invalid inputs are assigned to an extra categorical index) and 'error' (throw an error).
**Examples**
-Refer to the [OneHotEncoder Scala docs](api/scala/index.html#org.apache.spark.ml.feature.OneHotEncoder)
-for more details on the API.
+Refer to the [OneHotEncoderEstimator Scala docs](api/scala/index.html#org.apache.spark.ml.feature.OneHotEncoderEstimator) for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala %}
+{% include_example scala/org/apache/spark/examples/ml/OneHotEncoderEstimatorExample.scala %}
-Refer to the [OneHotEncoder Java docs](api/java/org/apache/spark/ml/feature/OneHotEncoder.html)
+Refer to the [OneHotEncoderEstimator Java docs](api/java/org/apache/spark/ml/feature/OneHotEncoderEstimator.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java %}
+{% include_example java/org/apache/spark/examples/ml/JavaOneHotEncoderEstimatorExample.java %}
-Refer to the [OneHotEncoder Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.OneHotEncoder)
-for more details on the API.
+Refer to the [OneHotEncoderEstimator Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.OneHotEncoderEstimator) for more details on the API.
-{% include_example python/ml/onehot_encoder_example.py %}
+{% include_example python/ml/onehot_encoder_estimator_example.py %}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderEstimatorExample.java
similarity index 62%
rename from examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java
rename to examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderEstimatorExample.java
index 99af37676ba9..6f93cff94b72 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderEstimatorExample.java
@@ -23,9 +23,8 @@
import java.util.Arrays;
import java.util.List;
-import org.apache.spark.ml.feature.OneHotEncoder;
-import org.apache.spark.ml.feature.StringIndexer;
-import org.apache.spark.ml.feature.StringIndexerModel;
+import org.apache.spark.ml.feature.OneHotEncoderEstimator;
+import org.apache.spark.ml.feature.OneHotEncoderModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
@@ -35,41 +34,37 @@
import org.apache.spark.sql.types.StructType;
// $example off$
-public class JavaOneHotEncoderExample {
+public class JavaOneHotEncoderEstimatorExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
.builder()
- .appName("JavaOneHotEncoderExample")
+ .appName("JavaOneHotEncoderEstimatorExample")
.getOrCreate();
+ // Note: categorical features are usually first encoded with StringIndexer
// $example on$
List data = Arrays.asList(
- RowFactory.create(0, "a"),
- RowFactory.create(1, "b"),
- RowFactory.create(2, "c"),
- RowFactory.create(3, "a"),
- RowFactory.create(4, "a"),
- RowFactory.create(5, "c")
+ RowFactory.create(0.0, 1.0),
+ RowFactory.create(1.0, 0.0),
+ RowFactory.create(2.0, 1.0),
+ RowFactory.create(0.0, 2.0),
+ RowFactory.create(0.0, 1.0),
+ RowFactory.create(2.0, 0.0)
);
StructType schema = new StructType(new StructField[]{
- new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
- new StructField("category", DataTypes.StringType, false, Metadata.empty())
+ new StructField("categoryIndex1", DataTypes.DoubleType, false, Metadata.empty()),
+ new StructField("categoryIndex2", DataTypes.DoubleType, false, Metadata.empty())
});
Dataset df = spark.createDataFrame(data, schema);
- StringIndexerModel indexer = new StringIndexer()
- .setInputCol("category")
- .setOutputCol("categoryIndex")
- .fit(df);
- Dataset indexed = indexer.transform(df);
+ OneHotEncoderEstimator encoder = new OneHotEncoderEstimator()
+ .setInputCols(new String[] {"categoryIndex1", "categoryIndex2"})
+ .setOutputCols(new String[] {"categoryVec1", "categoryVec2"});
- OneHotEncoder encoder = new OneHotEncoder()
- .setInputCol("categoryIndex")
- .setOutputCol("categoryVec");
-
- Dataset encoded = encoder.transform(indexed);
+ OneHotEncoderModel model = encoder.fit(df);
+ Dataset encoded = model.transform(df);
encoded.show();
// $example off$
diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_estimator_example.py
similarity index 65%
rename from examples/src/main/python/ml/onehot_encoder_example.py
rename to examples/src/main/python/ml/onehot_encoder_estimator_example.py
index e1996c7f0a55..2723e681cea7 100644
--- a/examples/src/main/python/ml/onehot_encoder_example.py
+++ b/examples/src/main/python/ml/onehot_encoder_estimator_example.py
@@ -18,32 +18,31 @@
from __future__ import print_function
# $example on$
-from pyspark.ml.feature import OneHotEncoder, StringIndexer
+from pyspark.ml.feature import OneHotEncoderEstimator
# $example off$
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession\
.builder\
- .appName("OneHotEncoderExample")\
+ .appName("OneHotEncoderEstimatorExample")\
.getOrCreate()
+ # Note: categorical features are usually first encoded with StringIndexer
# $example on$
df = spark.createDataFrame([
- (0, "a"),
- (1, "b"),
- (2, "c"),
- (3, "a"),
- (4, "a"),
- (5, "c")
- ], ["id", "category"])
+ (0.0, 1.0),
+ (1.0, 0.0),
+ (2.0, 1.0),
+ (0.0, 2.0),
+ (0.0, 1.0),
+ (2.0, 0.0)
+ ], ["categoryIndex1", "categoryIndex2"])
- stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
- model = stringIndexer.fit(df)
- indexed = model.transform(df)
-
- encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
- encoded = encoder.transform(indexed)
+ encoder = OneHotEncoderEstimator(inputCols=["categoryIndex1", "categoryIndex2"],
+ outputCols=["categoryVec1", "categoryVec2"])
+ model = encoder.fit(df)
+ encoded = model.transform(df)
encoded.show()
# $example off$
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderEstimatorExample.scala
similarity index 65%
rename from examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
rename to examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderEstimatorExample.scala
index 274cc1268f4d..45d816808ed8 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderEstimatorExample.scala
@@ -19,38 +19,34 @@
package org.apache.spark.examples.ml
// $example on$
-import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
+import org.apache.spark.ml.feature.OneHotEncoderEstimator
// $example off$
import org.apache.spark.sql.SparkSession
-object OneHotEncoderExample {
+object OneHotEncoderEstimatorExample {
def main(args: Array[String]): Unit = {
val spark = SparkSession
.builder
- .appName("OneHotEncoderExample")
+ .appName("OneHotEncoderEstimatorExample")
.getOrCreate()
+ // Note: categorical features are usually first encoded with StringIndexer
// $example on$
val df = spark.createDataFrame(Seq(
- (0, "a"),
- (1, "b"),
- (2, "c"),
- (3, "a"),
- (4, "a"),
- (5, "c")
- )).toDF("id", "category")
-
- val indexer = new StringIndexer()
- .setInputCol("category")
- .setOutputCol("categoryIndex")
- .fit(df)
- val indexed = indexer.transform(df)
-
- val encoder = new OneHotEncoder()
- .setInputCol("categoryIndex")
- .setOutputCol("categoryVec")
-
- val encoded = encoder.transform(indexed)
+ (0.0, 1.0),
+ (1.0, 0.0),
+ (2.0, 1.0),
+ (0.0, 2.0),
+ (0.0, 1.0),
+ (2.0, 0.0)
+ )).toDF("categoryIndex1", "categoryIndex2")
+
+ val encoder = new OneHotEncoderEstimator()
+ .setInputCols(Array("categoryIndex1", "categoryIndex2"))
+ .setOutputCols(Array("categoryVec1", "categoryVec2"))
+ val model = encoder.fit(df)
+
+ val encoded = model.transform(df)
encoded.show()
// $example off$