Commit cf7ab0a

robert3005 authored and bulldozer-bot[bot] committed
Revert "[SPARK-25040][SQL] Empty string for non string types should be disallowed" (apache#528)
1 parent c901250 commit cf7ab0a

File tree

3 files changed: +13 -49 lines changed


docs/sql-migration-guide-upgrade.md

Lines changed: 0 additions & 2 deletions
@@ -13,8 +13,6 @@ displayTitle: Spark SQL Upgrading Guide
 
   - In PySpark, when creating a `SparkSession` with `SparkSession.builder.getOrCreate()`, if there is an existing `SparkContext`, the builder was trying to update the `SparkConf` of the existing `SparkContext` with configurations specified to the builder, but the `SparkContext` is shared by all `SparkSession`s, so we should not update them. Since 3.0, the builder comes to not update the configurations. This is the same behavior as Java/Scala API in 2.3 and above. If you want to update them, you need to update them prior to creating a `SparkSession`.
 
-  - In Spark version 2.4 and earlier, the parser of JSON data source treats empty strings as null for some data types such as `IntegerType`. For `FloatType` and `DoubleType`, it fails on empty strings and throws exceptions. Since Spark 3.0, we disallow empty strings and will throw exceptions for data types except for `StringType` and `BinaryType`.
-
   - Since Spark 3.0, the `from_json` functions supports two modes - `PERMISSIVE` and `FAILFAST`. The modes can be set via the `mode` option. The default mode became `PERMISSIVE`. In previous versions, behavior of `from_json` did not conform to either `PERMISSIVE` nor `FAILFAST`, especially in processing of malformed JSON records. For example, the JSON string `{"a" 1}` with the schema `a INT` is converted to `null` by previous versions but Spark 3.0 converts it to `Row(null)`.
 
   - The `ADD JAR` command previously returned a result set with the single value 0. It now returns an empty result set.
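The deleted migration note describes the behavior this revert restores: an empty string in a field declared with a non-string type such as `IntegerType` parses to null again instead of failing the query. A minimal sketch of that restored behavior, assuming a local `SparkSession`; the session setup and the sample `{"a":""}` input are illustrative and not part of this diff:

import org.apache.spark.sql.{Row, SparkSession}

object EmptyStringJsonExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("empty-string-json")
      .getOrCreate()
    import spark.implicits._

    // An empty string in a field declared as INT.
    val df = spark.read
      .schema("a INT")
      .json(Seq("""{"a":""}""").toDS)

    // With SPARK-25040 reverted, the expected result is Row(null)
    // (the 2.4-style behavior) rather than a parse error.
    assert(df.collect().toSeq == Seq(Row(null)))

    spark.stop()
  }
}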

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala

Lines changed: 13 additions & 12 deletions
@@ -26,7 +26,7 @@ import scala.util.control.NonFatal
 
 import com.fasterxml.jackson.core._
 
-import org.apache.spark.internal.Logging
+import org.apache.spark.internal.SafeLogging
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.util._
@@ -40,7 +40,7 @@ import org.apache.spark.util.Utils
 class JacksonParser(
     schema: DataType,
     val options: JSONOptions,
-    allowArrayAsStructs: Boolean) extends Logging {
+    allowArrayAsStructs: Boolean) extends SafeLogging {
 
   import JacksonUtils._
   import com.fasterxml.jackson.core.JsonToken._
@@ -174,7 +174,7 @@ class JacksonParser(
       case VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT =>
         parser.getFloatValue
 
-      case VALUE_STRING if parser.getTextLength >= 1 =>
+      case VALUE_STRING =>
         // Special case handling for NaN and Infinity.
         parser.getText match {
           case "NaN" => Float.NaN
@@ -190,7 +190,7 @@ class JacksonParser(
       case VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT =>
         parser.getDoubleValue
 
-      case VALUE_STRING if parser.getTextLength >= 1 =>
+      case VALUE_STRING =>
         // Special case handling for NaN and Infinity.
         parser.getText match {
           case "NaN" => Double.NaN
@@ -217,7 +217,7 @@ class JacksonParser(
 
     case TimestampType =>
       (parser: JsonParser) => parseJsonToken[java.lang.Long](parser, dataType) {
-        case VALUE_STRING if parser.getTextLength >= 1 =>
+        case VALUE_STRING =>
          val stringValue = parser.getText
          // This one will lose microseconds parts.
          // See https://issues.apache.org/jira/browse/SPARK-10681.
@@ -236,7 +236,7 @@ class JacksonParser(
 
    case DateType =>
      (parser: JsonParser) => parseJsonToken[java.lang.Integer](parser, dataType) {
-        case VALUE_STRING if parser.getTextLength >= 1 =>
+        case VALUE_STRING =>
          val stringValue = parser.getText
          // This one will lose microseconds parts.
          // See https://issues.apache.org/jira/browse/SPARK-10681.
@@ -319,17 +319,18 @@ class JacksonParser(
   }
 
   /**
-   * This function throws an exception for failed conversion. For empty string on data types
-   * except for string and binary types, this also throws an exception.
+   * This function throws an exception for failed conversion, but returns null for empty string,
+   * to guard the non string types.
    */
   private def failedConversion[R >: Null](
       parser: JsonParser,
       dataType: DataType): PartialFunction[JsonToken, R] = {
-
-    // SPARK-25040: Disallow empty strings for data types except for string and binary types.
     case VALUE_STRING if parser.getTextLength < 1 =>
-      throw new RuntimeException(
-        s"Failed to parse an empty string for data type ${dataType.catalogString}")
+      safeLogInfo("Would have dropped empty record. " +
+        "Likely you have empty strings for nested values")
+      // If conversion is failed, this produces `null` rather than throwing exception.
+      // This will protect the mismatch of types.
+      null
 
     case token =>
       // We cannot parse this token based on the given data type. So, we throw a
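The behavioral core of the change is the empty-string branch of `failedConversion`: a `VALUE_STRING` token with zero length is now logged and mapped to null instead of throwing a `RuntimeException`. Below is a self-contained sketch of that token-level check using only Jackson core; the `readIntOrNull` helper and its inputs are hypothetical illustrations, not code from this commit:

import com.fasterxml.jackson.core.{JsonFactory, JsonToken}

object EmptyStringToken {
  private val factory = new JsonFactory()

  // Hypothetical helper: read the value of field "a" as Integer,
  // mapping an empty string to null the way the reverted branch does.
  def readIntOrNull(json: String): Integer = {
    val parser = factory.createParser(json)
    try {
      var result: Integer = null
      var token = parser.nextToken()
      while (token != null) {
        if (token == JsonToken.FIELD_NAME && parser.getCurrentName == "a") {
          parser.nextToken() match {
            case JsonToken.VALUE_NUMBER_INT =>
              result = parser.getIntValue
            case JsonToken.VALUE_STRING if parser.getTextLength < 1 =>
              // Reverted behavior: an empty string becomes null instead of an error.
              result = null
            case other =>
              throw new RuntimeException(s"Cannot convert $other to an int value")
          }
        }
        token = parser.nextToken()
      }
      result
    } finally {
      parser.close()
    }
  }

  def main(args: Array[String]): Unit = {
    assert(readIntOrNull("""{"a":42}""") == 42)
    assert(readIntOrNull("""{"a":""}""") == null)
  }
}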

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala

Lines changed: 0 additions & 35 deletions
@@ -2515,41 +2515,6 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
     countForMalformedJSON(0, Seq(""))
   }
 
-  test("SPARK-25040: empty strings should be disallowed") {
-    def failedOnEmptyString(dataType: DataType): Unit = {
-      val df = spark.read.schema(s"a ${dataType.catalogString}")
-        .option("mode", "FAILFAST").json(Seq("""{"a":""}""").toDS)
-      val errMessage = intercept[SparkException] {
-        df.collect()
-      }.getMessage
-      assert(errMessage.contains(
-        s"Failed to parse an empty string for data type ${dataType.catalogString}"))
-    }
-
-    def emptyString(dataType: DataType, expected: Any): Unit = {
-      val df = spark.read.schema(s"a ${dataType.catalogString}")
-        .option("mode", "FAILFAST").json(Seq("""{"a":""}""").toDS)
-      checkAnswer(df, Row(expected) :: Nil)
-    }
-
-    failedOnEmptyString(BooleanType)
-    failedOnEmptyString(ByteType)
-    failedOnEmptyString(ShortType)
-    failedOnEmptyString(IntegerType)
-    failedOnEmptyString(LongType)
-    failedOnEmptyString(FloatType)
-    failedOnEmptyString(DoubleType)
-    failedOnEmptyString(DecimalType.SYSTEM_DEFAULT)
-    failedOnEmptyString(TimestampType)
-    failedOnEmptyString(DateType)
-    failedOnEmptyString(ArrayType(IntegerType))
-    failedOnEmptyString(MapType(StringType, IntegerType, true))
-    failedOnEmptyString(StructType(StructField("f1", IntegerType, true) :: Nil))
-
-    emptyString(StringType, "")
-    emptyString(BinaryType, "".getBytes(StandardCharsets.UTF_8))
-  }
-
   test("do not produce empty files for empty partitions") {
     withTempPath { dir =>
       val path = dir.getCanonicalPath
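Deleting the SPARK-25040 test leaves the empty-string path without suite-level coverage. A hedged sketch of what a replacement check could look like, written in the style of the removed test and assuming the JsonSuite helpers (`spark`, the `toDS` implicits, `checkAnswer`); the test name and the `Row(null)` expectation for an int column follow from the reverted 2.4-style behavior, not from this commit:

test("empty strings parse leniently after reverting SPARK-25040") {
  // String columns keep the empty value.
  val stringDf = spark.read.schema("a STRING")
    .json(Seq("""{"a":""}""").toDS)
  checkAnswer(stringDf, Row("") :: Nil)

  // Assumed reverted behavior: a non-string column such as IntegerType
  // maps the empty string to null instead of failing the query.
  val intDf = spark.read.schema("a INT")
    .json(Seq("""{"a":""}""").toDS)
  checkAnswer(intDf, Row(null) :: Nil)
}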
