From 13e0245f11f0199807f3c5342cdd6d6aa1e1a92a Mon Sep 17 00:00:00 2001
From: Wojtek Szymanski
Date: Sun, 5 Mar 2017 15:52:52 +0100
Subject: [PATCH 1/2] Bucketizer.handleInvalid docs improved

---
 .../org/apache/spark/ml/feature/Bucketizer.scala   | 13 ++++++++-----
 python/pyspark/ml/feature.py                       |  4 +++-
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
index d1f3b2af1e48..db662308279d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
@@ -78,16 +78,19 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
   def setOutputCol(value: String): this.type = set(outputCol, value)
 
   /**
-   * Param for how to handle invalid entries. Options are 'skip' (filter out rows with
-   * invalid values), 'error' (throw an error), or 'keep' (keep invalid values in a special
-   * additional bucket).
+   * Param for how to handle invalid entries containing either NaN or null values.
+   * Values outside the splits will always be treated as errors.
+   * Options are 'skip' (filter out rows with invalid values), 'error' (throw an error),
+   * or 'keep' (keep invalid values in a special additional bucket).
    * Default: "error"
    * @group param
    */
   // TODO: SPARK-18619 Make Bucketizer inherit from HasHandleInvalid.
   @Since("2.1.0")
-  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
-    "invalid entries. Options are skip (filter out rows with invalid values), " +
+  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid",
+    "how to handle invalid entries containing either NaN or null values. " +
+    "Values outside the splits will always be treated as errors. " +
+    "Options are skip (filter out rows with invalid values), " +
     "error (throw an error), or keep (keep invalid values in a special additional bucket).",
     ParamValidators.inArray(Bucketizer.supportedHandleInvalids))
 
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 92f8549e9cb9..af8c3daa4d24 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -356,7 +356,9 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav
               "splits specified will be treated as errors.",
               typeConverter=TypeConverters.toListFloat)
 
-    handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. " +
+    handleInvalid = Param(Params._dummy(), "handleInvalid",
+                          "how to handle invalid entries containing either NaN or null values. " +
+                          "Values outside the splits will always be treated as errors. " +
                           "Options are 'skip' (filter out rows with invalid values), " +
                           "'error' (throw an error), or 'keep' (keep invalid values in a special " +
                           "additional bucket).",

From ca6e9577f16e453abd25e8011db0146394822ef3 Mon Sep 17 00:00:00 2001
From: Wojtek Szymanski
Date: Mon, 6 Mar 2017 22:58:03 +0100
Subject: [PATCH 2/2] Bucketizer docs update - nulls not supported in handleInvalid

---
 .../main/scala/org/apache/spark/ml/feature/Bucketizer.scala | 4 ++--
 python/pyspark/ml/feature.py                                | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
index db662308279d..07d44a9dca0c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
@@ -78,7 +78,7 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
   def setOutputCol(value: String): this.type = set(outputCol, value)
 
   /**
-   * Param for how to handle invalid entries containing either NaN or null values.
+   * Param for how to handle invalid entries containing NaN values.
    * Values outside the splits will always be treated as errors.
    * Options are 'skip' (filter out rows with invalid values), 'error' (throw an error),
    * or 'keep' (keep invalid values in a special additional bucket).
@@ -88,7 +88,7 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
   // TODO: SPARK-18619 Make Bucketizer inherit from HasHandleInvalid.
   @Since("2.1.0")
   val handleInvalid: Param[String] = new Param[String](this, "handleInvalid",
-    "how to handle invalid entries containing either NaN or null values. " +
+    "how to handle invalid entries containing NaN values. " +
     "Values outside the splits will always be treated as errors. " +
     "Options are skip (filter out rows with invalid values), " +
     "error (throw an error), or keep (keep invalid values in a special additional bucket).",
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index af8c3daa4d24..5aaeb7c4decf 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -357,7 +357,7 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav
               typeConverter=TypeConverters.toListFloat)
 
     handleInvalid = Param(Params._dummy(), "handleInvalid",
-                          "how to handle invalid entries containing either NaN or null values. " +
+                          "how to handle invalid entries containing NaN values. " +
                           "Values outside the splits will always be treated as errors. " +
                           "Options are 'skip' (filter out rows with invalid values), " +
                           "'error' (throw an error), or 'keep' (keep invalid values in a special " +