Commits (22; the diff below shows changes from 13 of them)
- c567dcc: Add a test (MaxGekk, Nov 8, 2018)
- 2b41eba: Fix decimal parsing (MaxGekk, Nov 8, 2018)
- cf438ae: Add locale option (MaxGekk, Nov 8, 2018)
- f9438c4: Updating the migration guide (MaxGekk, Nov 8, 2018)
- 3125c23: Fix imports (MaxGekk, Nov 8, 2018)
- 64a97a2: Merge remote-tracking branch 'origin/master' into decimal-parsing-locale (MaxGekk, Nov 9, 2018)
- 2f76352: Renaming decimalParser to decimalFormat (MaxGekk, Nov 11, 2018)
- 3dfce18: Moving the test to UnivocityParserSuite (MaxGekk, Nov 11, 2018)
- bdca7c4: Support the SQL config spark.sql.legacy.decimalParsing.enabled (MaxGekk, Nov 12, 2018)
- 8c5593e: Updating the migration guide. (MaxGekk, Nov 12, 2018)
- 18470b0: Refactoring (MaxGekk, Nov 12, 2018)
- c28b79f: Removing internal (MaxGekk, Nov 12, 2018)
- 1723da2: Test refactoring (MaxGekk, Nov 12, 2018)
- 6cdafa5: Added a test for inferring the decimal type (MaxGekk, Nov 13, 2018)
- 14b5109: Inferring decimals from CSV (MaxGekk, Nov 14, 2018)
- bab8fb2: Renaming df to decimalFormat (MaxGekk, Nov 22, 2018)
- 5236336: Merge remote-tracking branch 'origin/master' into decimal-parsing-locale (MaxGekk, Nov 23, 2018)
- 0d1a4f0: Merge branch 'master' into decimal-parsing-locale (MaxGekk, Nov 27, 2018)
- 8b1456c: Merge remote-tracking branch 'origin/master' into decimal-parsing-locale (MaxGekk, Nov 28, 2018)
- 0859624: Removing SQL config and special handling of Locale.US (MaxGekk, Nov 28, 2018)
- e989b77: Merge remote-tracking branch 'fork/decimal-parsing-locale' into decim… (MaxGekk, Nov 28, 2018)
- 521bd45: Merge remote-tracking branch 'origin/master' into decimal-parsing-locale (MaxGekk, Nov 29, 2018)
2 changes: 2 additions & 0 deletions docs/sql-migration-guide-upgrade.md
@@ -9,6 +9,8 @@ displayTitle: Spark SQL Upgrading Guide

## Upgrading From Spark SQL 2.4 to 3.0

- In Spark version 2.4 and earlier, the accepted format of decimals parsed from CSV was an optional sign ('+' or '-'), followed by a sequence of zero or more decimal digits, optionally followed by a fraction, optionally followed by an exponent. Any commas were removed from the input before parsing. Since Spark 3.0, the format varies with the locale, which can be set via the CSV option `locale`. The default locale is `en-US`. To switch back to the previous behavior, set `spark.sql.legacy.decimalParsing.enabled` to `true`.

- In PySpark, when creating a `SparkSession` with `SparkSession.builder.getOrCreate()`, if there is an existing `SparkContext`, the builder tried to update the `SparkConf` of the existing `SparkContext` with configurations specified to the builder; but the `SparkContext` is shared by all `SparkSession`s, so those configurations should not be updated. Since 3.0, the builder no longer updates them. This is the same behavior as the Java/Scala API in 2.3 and above. If you want to update the configurations, you need to do so before creating a `SparkSession`.

- In Spark version 2.4 and earlier, the parser of the JSON data source treated empty strings as null for some data types such as `IntegerType`; for `FloatType` and `DoubleType`, it failed on empty strings and threw exceptions. Since Spark 3.0, empty strings are disallowed and exceptions are thrown for all data types except `StringType` and `BinaryType`.
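To illustrate the first note above: a minimal sketch of reading locale-formatted decimals through the new CSV `locale` option. This is illustrative, not part of the diff; `spark` is an assumed `SparkSession`, and the file name and schema are invented for the example.

// prices.csv contains German-formatted numbers such as: 1.000,001
// ('.' groups thousands, ',' separates the fraction)
val prices = spark.read
  .option("locale", "de-DE")
  .schema("price DECIMAL(10, 5)")
  .csv("prices.csv")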
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.csv

import java.io.InputStream
import java.math.BigDecimal
import java.text.{DecimalFormat, DecimalFormatSymbols}

import scala.util.Try
import scala.util.control.NonFatal
@@ -29,6 +30,7 @@ import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.util.{BadRecordException, DateTimeUtils, FailureSafeParser}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

@@ -104,6 +106,14 @@ class UnivocityParser(
    requiredSchema.map(f => makeConverter(f.name, f.dataType, f.nullable, options)).toArray
  }

  private val decimalParser = if (SQLConf.get.legacyDecimalParsing) {
    (s: String) => new BigDecimal(s.replaceAll(",", ""))
  } else {
    val df = new DecimalFormat("", new DecimalFormatSymbols(options.locale))
    df.setParseBigDecimal(true)
    (s: String) => df.parse(s).asInstanceOf[BigDecimal]
  }

  /**
   * Create a converter which converts the string value to a value according to a desired type.
   * Currently, we do not support complex types (`ArrayType`, `MapType`, `StructType`).
@@ -149,8 +159,7 @@ class UnivocityParser(

    case dt: DecimalType => (d: String) =>
      nullSafeDatum(d, name, nullable, options) { datum =>
-       val value = new BigDecimal(datum.replaceAll(",", ""))
-       Decimal(value, dt.precision, dt.scale)
+       Decimal(decimalParser(datum), dt.precision, dt.scale)
      }

    case _: TimestampType => (d: String) =>
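For context on the non-legacy branch of `decimalParser` above: `DecimalFormatSymbols` supplies the grouping and decimal separators of the locale taken from the CSV option `locale`, and `setParseBigDecimal(true)` makes `parse` return a `java.math.BigDecimal` instead of a `Long` or `Double`. A standalone sketch of the same mechanism (illustrative only, not part of the diff):

import java.math.BigDecimal
import java.text.{DecimalFormat, DecimalFormatSymbols}
import java.util.Locale

// In de-DE, '.' is the grouping separator and ',' the decimal separator.
val df = new DecimalFormat("", new DecimalFormatSymbols(Locale.forLanguageTag("de-DE")))
df.setParseBigDecimal(true)
println(df.parse("1.000,001").asInstanceOf[BigDecimal])  // should print 1000.001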
sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1594,6 +1594,13 @@ object SQLConf {
"WHERE, which does not follow SQL standard.")
.booleanConf
.createWithDefault(false)

val LEGACY_DECIMAL_PARSING_ENABLED = buildConf("spark.sql.legacy.decimalParsing.enabled")
.doc("If it is set to false, it enables parsing decimals in locale specific formats. " +
"To switch back to previous behaviour when parsing was performed by java.math.BigDecimal " +
"and all commas were removed from the input, set the flag to true.")
.booleanConf
.createWithDefault(false)
}

/**
@@ -2009,6 +2016,8 @@ class SQLConf extends Serializable with Logging {

  def integralDivideReturnLong: Boolean = getConf(SQLConf.LEGACY_INTEGRALDIVIDE_RETURN_LONG)

  def legacyDecimalParsing: Boolean = getConf(SQLConf.LEGACY_DECIMAL_PARSING_ENABLED)

  /** ********************** SQLConf functionality methods ************ */

  /** Set Spark SQL configuration properties. */
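A quick way to exercise the new flag (a sketch; `spark` is an assumed `SparkSession`):

// Restore the pre-3.0 behaviour: strip all commas, then parse with java.math.BigDecimal.
spark.conf.set("spark.sql.legacy.decimalParsing.enabled", "true")
// or, equivalently, in SQL:
spark.sql("SET spark.sql.legacy.decimalParsing.enabled=true")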
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala
@@ -18,13 +18,17 @@
package org.apache.spark.sql.catalyst.csv

import java.math.BigDecimal
import java.text.{DecimalFormat, DecimalFormatSymbols}
import java.util.Locale

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.plans.SQLHelper
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

-class UnivocityParserSuite extends SparkFunSuite {
+class UnivocityParserSuite extends SparkFunSuite with SQLHelper {
  private val parser = new UnivocityParser(
    StructType(Seq.empty),
    new CSVOptions(Map.empty[String, String], false, "GMT"))
@@ -196,4 +200,34 @@ class UnivocityParserSuite extends SparkFunSuite {
    assert(doubleVal2 == Double.PositiveInfinity)
  }

  test("parse decimals using locale") {
    def checkDecimalParsing(langTag: String): Unit = {
      val decimalVal = new BigDecimal("1000.001")
      val decimalType = new DecimalType(10, 5)
      val expected = Decimal(decimalVal, decimalType.precision, decimalType.scale)
      val df = new DecimalFormat("", new DecimalFormatSymbols(Locale.forLanguageTag(langTag)))
      val input = df.format(expected.toBigDecimal)

      val options = new CSVOptions(Map("locale" -> langTag), false, "GMT")
      val parser = new UnivocityParser(new StructType().add("d", decimalType), options)

      assert(parser.makeConverter("_1", decimalType, options = options).apply(input) === expected)
    }

    withSQLConf(SQLConf.LEGACY_DECIMAL_PARSING_ENABLED.key -> "false") {
      Seq("en-US", "ko-KR", "ru-RU", "de-DE").foreach(checkDecimalParsing)
    }

    withSQLConf(SQLConf.LEGACY_DECIMAL_PARSING_ENABLED.key -> "true") {
      Seq("en-US", "ko-KR").foreach(checkDecimalParsing)
    }

    withSQLConf(SQLConf.LEGACY_DECIMAL_PARSING_ENABLED.key -> "true") {
      Seq("ru-RU").foreach { langTag =>
        intercept[NumberFormatException] {
          checkDecimalParsing(langTag)
        }
      }
    }
  }
}
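Why the test expects these outcomes: the string that `DecimalFormat` produces for 1000.001 differs per locale, and the legacy parser only strips commas before handing the string to `java.math.BigDecimal`. An illustrative sketch (the outputs shown come from the JDK's locale data and may vary across JDK versions):

import java.math.BigDecimal
import java.text.{DecimalFormat, DecimalFormatSymbols}
import java.util.Locale

Seq("en-US", "ko-KR", "ru-RU", "de-DE").foreach { tag =>
  val df = new DecimalFormat("", new DecimalFormatSymbols(Locale.forLanguageTag(tag)))
  println(s"$tag -> ${df.format(new BigDecimal("1000.001"))}")
}
// en-US -> 1,000.001   (legacy mode passes: stripping commas leaves "1000.001")
// ko-KR -> 1,000.001   (same as en-US)
// ru-RU -> 1 000,001   (the group separator is a non-breaking space; stripping
//                       commas leaves "1 000001", which BigDecimal rejects with
//                       a NumberFormatException, as the last test case asserts)
// de-DE -> 1.000,001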