
Commit 2e2875b

wangyum and HyukjinKwon authored and committed
[SPARK-47646][SQL] Make try_to_number return NULL for malformed input (apache#378)
### What changes were proposed in this pull request?

This PR proposes to add a NULL check after parsing the number so the output can be safely null for the `try_to_number` expression.

```scala
import org.apache.spark.sql.functions._

val df = spark.createDataset(spark.sparkContext.parallelize(Seq("11")))
df.select(try_to_number($"value", lit("$99.99"))).show()
```

```
java.lang.NullPointerException: Cannot invoke "org.apache.spark.sql.types.Decimal.toPlainString()" because "<local7>" is null
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.serializefromobject_doConsume_0$(Unknown Source)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:894)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:894)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:368)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:332)
```

### Why are the changes needed?

To fix the bug and let `try_to_number` return `NULL` for malformed input, as designed.

### Does this PR introduce _any_ user-facing change?

Yes, it fixes a bug. Previously, `try_to_number` failed with an NPE.

### How was this patch tested?

A unit test was added.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes apache#45771 from HyukjinKwon/SPARK-47646.

Authored-by: Hyukjin Kwon <[email protected]>
(cherry picked from commit d709e20)
Signed-off-by: Hyukjin Kwon <[email protected]>
Co-authored-by: Hyukjin Kwon <[email protected]>
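For reference, a minimal end-to-end sketch of the behaviour this change targets, assuming a spark-shell style session where `spark` and its implicits are in scope; the `parsed` alias, the sample value `"$78.12"`, and the exact `show()` layout are illustrative:

```scala
import org.apache.spark.sql.functions.{lit, try_to_number}
import spark.implicits._

// "$78.12" matches the "$99.99" format; "11" does not (no leading '$'),
// so after this fix the second row comes back as NULL instead of hitting an NPE.
val df = Seq("$78.12", "11").toDF("value")
df.select(try_to_number($"value", lit("$99.99")).as("parsed")).show()
// +------+
// |parsed|
// +------+
// | 78.12|
// |  NULL|
// +------+
```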
1 parent 6644a9d commit 2e2875b

File tree

2 files changed: +6 -0 lines


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala

Lines changed: 1 addition & 0 deletions

@@ -86,6 +86,7 @@ abstract class ToNumberBase(left: Expression, right: Expression, errorOnFail: Bo
        |${CodeGenerator.javaType(dataType)} ${ev.value} = ${CodeGenerator.defaultValue(dataType)};
        |if (!${ev.isNull}) {
        |  ${ev.value} = $builder.parse(${eval.value});
+       |  ${ev.isNull} = ${ev.isNull} || (${ev.value} == null);
        |}
        """.stripMargin)
   }
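Conceptually, the added line gives the generated evaluation the shape sketched below in plain Scala. This is an illustration of the null handling only, not the actual generated Java; the `parse` parameter stands in for the number formatter's parse call, which is assumed to return null for malformed input when errors are suppressed (the `try_to_number` case):

```scala
import org.apache.spark.sql.types.Decimal
import org.apache.spark.unsafe.types.UTF8String

// Sketch of the codegen logic after the fix: if parsing produces null,
// the expression's null flag is set so downstream code never dereferences a null Decimal.
def evalTryToNumber(
    input: UTF8String,
    inputIsNull: Boolean,
    parse: UTF8String => Decimal): (Boolean, Decimal) = {
  var isNull = inputIsNull
  var value: Decimal = null
  if (!isNull) {
    value = parse(input)                // may be null for malformed input
    isNull = isNull || (value == null)  // the check added by this commit
  }
  (isNull, value)
}
```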

sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala

Lines changed: 5 additions & 0 deletions

@@ -1173,6 +1173,11 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
     checkAnswer(df.select(try_to_number(col("a"), lit("$99.99"))), Seq(Row(78.12)))
   }
 
+  test("SPARK-47646: try_to_number should return NULL for malformed input") {
+    val df = spark.createDataset(spark.sparkContext.parallelize(Seq("11")))
+    checkAnswer(df.select(try_to_number($"value", lit("$99.99"))), Seq(Row(null)))
+  }
+
   test("SPARK-44905: stateful lastRegex causes NullPointerException on eval for regexp_replace") {
     val df = sql("select regexp_replace('', '[a\\\\d]{0, 2}', 'x')")
     intercept[SparkRuntimeException](df.queryExecution.optimizedPlan)
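The new test exercises the DataFrame API; the same behaviour can be spot-checked from SQL as well. A small sketch, assuming a running session (column alias and `show()` layout are illustrative):

```scala
// With the fix, try_to_number returns NULL for input that does not match the
// format, which is the designed semantics of the try_ variant.
spark.sql("SELECT try_to_number('11', '$99.99') AS result").show()
// +------+
// |result|
// +------+
// |  NULL|
// +------+
```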
