Commits (22; the diff below shows changes from 13 of them)
- c567dcc: Add a test (MaxGekk, Nov 8, 2018)
- 2b41eba: Fix decimal parsing (MaxGekk, Nov 8, 2018)
- cf438ae: Add locale option (MaxGekk, Nov 8, 2018)
- f9438c4: Updating the migration guide (MaxGekk, Nov 8, 2018)
- 3125c23: Fix imports (MaxGekk, Nov 8, 2018)
- 64a97a2: Merge remote-tracking branch 'origin/master' into decimal-parsing-locale (MaxGekk, Nov 9, 2018)
- 2f76352: Renaming decimalParser to decimalFormat (MaxGekk, Nov 11, 2018)
- 3dfce18: Moving the test to UnivocityParserSuite (MaxGekk, Nov 11, 2018)
- bdca7c4: Support the SQL config spark.sql.legacy.decimalParsing.enabled (MaxGekk, Nov 12, 2018)
- 8c5593e: Updating the migration guide. (MaxGekk, Nov 12, 2018)
- 18470b0: Refactoring (MaxGekk, Nov 12, 2018)
- c28b79f: Removing internal (MaxGekk, Nov 12, 2018)
- 1723da2: Test refactoring (MaxGekk, Nov 12, 2018)
- 6cdafa5: Added a test for inferring the decimal type (MaxGekk, Nov 13, 2018)
- 14b5109: Inferring decimals from CSV (MaxGekk, Nov 14, 2018)
- bab8fb2: Renaming df to decimalFormat (MaxGekk, Nov 22, 2018)
- 5236336: Merge remote-tracking branch 'origin/master' into decimal-parsing-locale (MaxGekk, Nov 23, 2018)
- 0d1a4f0: Merge branch 'master' into decimal-parsing-locale (MaxGekk, Nov 27, 2018)
- 8b1456c: Merge remote-tracking branch 'origin/master' into decimal-parsing-locale (MaxGekk, Nov 28, 2018)
- 0859624: Removing SQL config and special handling of Locale.US (MaxGekk, Nov 28, 2018)
- e989b77: Merge remote-tracking branch 'fork/decimal-parsing-locale' into decim… (MaxGekk, Nov 28, 2018)
- 521bd45: Merge remote-tracking branch 'origin/master' into decimal-parsing-locale (MaxGekk, Nov 29, 2018)
2 changes: 2 additions & 0 deletions docs/sql-migration-guide-upgrade.md
@@ -9,6 +9,8 @@ displayTitle: Spark SQL Upgrading Guide

## Upgrading From Spark SQL 2.4 to 3.0

- In Spark version 2.4 and earlier, the accepted format of decimals parsed from CSV was an optional sign ('+' or '-'), followed by a sequence of zero or more decimal digits, optionally followed by a fraction, optionally followed by an exponent. Any commas were removed from the input before parsing. Since Spark 3.0, the format varies with the locale, which can be set via the CSV option `locale`. The default locale is `en-US`. To switch back to the previous behavior, set `spark.sql.legacy.decimalParsing.enabled` to `true`.

- In PySpark, when creating a `SparkSession` with `SparkSession.builder.getOrCreate()`, if there is an existing `SparkContext`, the builder tried to update the `SparkConf` of the existing `SparkContext` with configurations specified to the builder; but the `SparkContext` is shared by all `SparkSession`s, so those configurations should not be updated. Since 3.0, the builder no longer updates them. This is the same behavior as the Java/Scala API in 2.3 and above. If you want to update the configurations, you need to do so before creating a `SparkSession`.

- In Spark version 2.4 and earlier, the parser of the JSON data source treated empty strings as null for some data types such as `IntegerType`; for `FloatType` and `DoubleType`, it failed on empty strings and threw exceptions. Since Spark 3.0, empty strings are disallowed and exceptions are thrown for all data types except `StringType` and `BinaryType`.
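To illustrate the first note above: a minimal sketch of reading locale-formatted decimals through the new CSV `locale` option. This is illustrative, not part of the diff; `spark` is an assumed `SparkSession`, and the file name and schema are invented for the example.

// prices.csv contains German-formatted numbers such as: 1.000,001
// ('.' groups thousands, ',' separates the fraction)
val prices = spark.read
  .option("locale", "de-DE")
  .schema("price DECIMAL(10, 5)")
  .csv("prices.csv")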
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.csv

import java.io.InputStream
import java.math.BigDecimal
import java.text.{DecimalFormat, DecimalFormatSymbols}

import scala.util.Try
import scala.util.control.NonFatal
@@ -29,6 +30,7 @@ import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.util.{BadRecordException, DateTimeUtils, FailureSafeParser}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

@@ -104,6 +106,14 @@ class UnivocityParser(
    requiredSchema.map(f => makeConverter(f.name, f.dataType, f.nullable, options)).toArray
  }

  private val decimalParser = if (SQLConf.get.legacyDecimalParsing) {
    (s: String) => new BigDecimal(s.replaceAll(",", ""))
  } else {
    val df = new DecimalFormat("", new DecimalFormatSymbols(options.locale))
    df.setParseBigDecimal(true)
    (s: String) => df.parse(s).asInstanceOf[BigDecimal]
  }

  /**
   * Create a converter which converts the string value to a value according to a desired type.
   * Currently, we do not support complex types (`ArrayType`, `MapType`, `StructType`).
@@ -149,8 +159,7 @@ class UnivocityParser(

    case dt: DecimalType => (d: String) =>
      nullSafeDatum(d, name, nullable, options) { datum =>
-       val value = new BigDecimal(datum.replaceAll(",", ""))
-       Decimal(value, dt.precision, dt.scale)
+       Decimal(decimalParser(datum), dt.precision, dt.scale)
      }

    case _: TimestampType => (d: String) =>
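For context on the non-legacy branch of `decimalParser` above: `DecimalFormatSymbols` supplies the grouping and decimal separators of the locale taken from the CSV option `locale`, and `setParseBigDecimal(true)` makes `parse` return a `java.math.BigDecimal` instead of a `Long` or `Double`. A standalone sketch of the same mechanism (illustrative only, not part of the diff):

import java.math.BigDecimal
import java.text.{DecimalFormat, DecimalFormatSymbols}
import java.util.Locale

// In de-DE, '.' is the grouping separator and ',' the decimal separator.
val df = new DecimalFormat("", new DecimalFormatSymbols(Locale.forLanguageTag("de-DE")))
df.setParseBigDecimal(true)
println(df.parse("1.000,001").asInstanceOf[BigDecimal])  // should print 1000.001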
sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1594,6 +1594,13 @@ object SQLConf {
"WHERE, which does not follow SQL standard.")
.booleanConf
.createWithDefault(false)

val LEGACY_DECIMAL_PARSING_ENABLED = buildConf("spark.sql.legacy.decimalParsing.enabled")
.doc("If it is set to false, it enables parsing decimals in locale specific formats. " +
"To switch back to previous behaviour when parsing was performed by java.math.BigDecimal " +
"and all commas were removed from the input, set the flag to true.")
.booleanConf
.createWithDefault(false)
}

/**
@@ -2009,6 +2016,8 @@ class SQLConf extends Serializable with Logging {

  def integralDivideReturnLong: Boolean = getConf(SQLConf.LEGACY_INTEGRALDIVIDE_RETURN_LONG)

  def legacyDecimalParsing: Boolean = getConf(SQLConf.LEGACY_DECIMAL_PARSING_ENABLED)

  /** ********************** SQLConf functionality methods ************ */

  /** Set Spark SQL configuration properties. */
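A quick way to exercise the new flag (a sketch; `spark` is an assumed `SparkSession`):

// Restore the pre-3.0 behaviour: strip all commas, then parse with java.math.BigDecimal.
spark.conf.set("spark.sql.legacy.decimalParsing.enabled", "true")
// or, equivalently, in SQL:
spark.sql("SET spark.sql.legacy.decimalParsing.enabled=true")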
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala
@@ -18,13 +18,17 @@
package org.apache.spark.sql.catalyst.csv

import java.math.BigDecimal
import java.text.{DecimalFormat, DecimalFormatSymbols}
import java.util.Locale

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.plans.SQLHelper
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

-class UnivocityParserSuite extends SparkFunSuite {
+class UnivocityParserSuite extends SparkFunSuite with SQLHelper {
  private val parser = new UnivocityParser(
    StructType(Seq.empty),
    new CSVOptions(Map.empty[String, String], false, "GMT"))
@@ -196,4 +200,34 @@ class UnivocityParserSuite extends SparkFunSuite {
    assert(doubleVal2 == Double.PositiveInfinity)
  }

  test("parse decimals using locale") {
    def checkDecimalParsing(langTag: String): Unit = {
      val decimalVal = new BigDecimal("1000.001")
      val decimalType = new DecimalType(10, 5)
      val expected = Decimal(decimalVal, decimalType.precision, decimalType.scale)
      val df = new DecimalFormat("", new DecimalFormatSymbols(Locale.forLanguageTag(langTag)))
      val input = df.format(expected.toBigDecimal)

      val options = new CSVOptions(Map("locale" -> langTag), false, "GMT")
      val parser = new UnivocityParser(new StructType().add("d", decimalType), options)

      assert(parser.makeConverter("_1", decimalType, options = options).apply(input) === expected)
    }

    withSQLConf(SQLConf.LEGACY_DECIMAL_PARSING_ENABLED.key -> "false") {
      Seq("en-US", "ko-KR", "ru-RU", "de-DE").foreach(checkDecimalParsing)
    }

    withSQLConf(SQLConf.LEGACY_DECIMAL_PARSING_ENABLED.key -> "true") {
      Seq("en-US", "ko-KR").foreach(checkDecimalParsing)
    }

    withSQLConf(SQLConf.LEGACY_DECIMAL_PARSING_ENABLED.key -> "true") {
      Seq("ru-RU").foreach { langTag =>
        intercept[NumberFormatException] {
          checkDecimalParsing(langTag)
        }
      }
    }
  }
}
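Why the test expects these outcomes: the string that `DecimalFormat` produces for 1000.001 differs per locale, and the legacy parser only strips commas before handing the string to `java.math.BigDecimal`. An illustrative sketch (the outputs shown come from the JDK's locale data and may vary across JDK versions):

import java.math.BigDecimal
import java.text.{DecimalFormat, DecimalFormatSymbols}
import java.util.Locale

Seq("en-US", "ko-KR", "ru-RU", "de-DE").foreach { tag =>
  val df = new DecimalFormat("", new DecimalFormatSymbols(Locale.forLanguageTag(tag)))
  println(s"$tag -> ${df.format(new BigDecimal("1000.001"))}")
}
// en-US -> 1,000.001   (legacy mode passes: stripping commas leaves "1000.001")
// ko-KR -> 1,000.001   (same as en-US)
// ru-RU -> 1 000,001   (the group separator is a non-breaking space; stripping
//                       commas leaves "1 000001", which BigDecimal rejects with
//                       a NumberFormatException, as the last test case asserts)
// de-DE -> 1.000,001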