CSVInferSchema.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.catalyst.csv
 
+import java.util.Locale
+
 import scala.util.control.Exception.allCatch
 
 import org.apache.spark.rdd.RDD

@@ -32,7 +34,10 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
     options.zoneId,
     options.locale)
 
-  private val decimalParser = {
+  private val decimalParser = if (options.locale == Locale.US) {
@MaxGekk (Member) commented on Apr 22, 2019:

It looks slightly ugly that we handle the special case of Locale.US for JSON inside ExprUtils.getDecimalParser but for CSV outside of it. Maybe we should unify the implementation and handle the special cases outside of the generic ExprUtils.getDecimalParser (or process both special cases inside of it)?

The PR author (Member) replied:

Yes, I think we have to do that. Let's do it later, when we drop the replacement logic entirely. I realised that the CSV inference path alone handles it without the replacement.

+    // Special handling the default locale for backward compatibility
+    s: String => new java.math.BigDecimal(s)
+  } else {
     ExprUtils.getDecimalParser(options.locale)
   }
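As a follow-up to the thread above, a minimal sketch of what a unified helper might look like, with the Locale.US special case folded inside the generic implementation. This is illustrative only; the actual signature and error handling of ExprUtils.getDecimalParser may differ:

import java.text.{DecimalFormat, DecimalFormatSymbols, ParsePosition}
import java.util.Locale

// Hypothetical unified factory: both the JSON and CSV callers could use this
// directly and drop their own Locale.US branches.
def getDecimalParser(locale: Locale): String => java.math.BigDecimal = {
  if (locale == Locale.US) {
    // Default locale: plain BigDecimal parsing, for backward compatibility.
    (s: String) => new java.math.BigDecimal(s)
  } else {
    val decimalFormat = new DecimalFormat("", new DecimalFormatSymbols(locale))
    decimalFormat.setParseBigDecimal(true)
    (s: String) => {
      val pos = new ParsePosition(0)
      val result = decimalFormat.parse(s, pos).asInstanceOf[java.math.BigDecimal]
      // Reject partial matches so e.g. trailing garbage does not parse.
      if (pos.getIndex != s.length || pos.getErrorIndex != -1) {
        throw new NumberFormatException(s"Cannot parse '$s' as a decimal in locale $locale")
      }
      result
    }
  }
}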

CSVInferSchemaSuite.scala
@@ -185,6 +185,8 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper {
       assert(inferSchema.inferField(NullType, input) == expectedType)
     }
 
-    Seq("en-US", "ko-KR", "ru-RU", "de-DE").foreach(checkDecimalInfer(_, DecimalType(7, 0)))
+    // input like '1,0' is inferred as strings for backward compatibility.
A Contributor commented:

Sorry, I may not have all the context, but doesn't 1,0 mean two int columns?

A Member replied:

The field separator here is | not , so 1,0 cannot be read as two separate columns.

Seq("en-US").foreach(checkDecimalInfer(_, StringType))
Seq("ko-KR", "ru-RU", "de-DE").foreach(checkDecimalInfer(_, DecimalType(7, 0)))
}
}
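A note on why the en-US expectation flips to StringType: java.text.DecimalFormat parses grouping separators leniently, so under en-US the comma in "1,0" is accepted as a (misplaced) grouping separator and the string silently parses as 10. A small standalone illustration, independent of the patch:

import java.text.{DecimalFormat, DecimalFormatSymbols, ParsePosition}
import java.util.Locale

def parseDecimal(s: String, locale: Locale): java.math.BigDecimal = {
  val format = new DecimalFormat("", new DecimalFormatSymbols(locale))
  format.setParseBigDecimal(true)
  format.parse(s, new ParsePosition(0)).asInstanceOf[java.math.BigDecimal]
}

parseDecimal("1,0", Locale.US)                       // 10:  ',' read as a grouping separator
parseDecimal("1,0", Locale.forLanguageTag("de-DE"))  // 1.0: ',' is the decimal separator

Restoring plain new java.math.BigDecimal(s) for en-US makes "1,0" fail to parse, so inference falls back to StringType, which is the pre-existing behavior the suite now locks in.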
CSVSuite.scala
@@ -2052,4 +2052,11 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te

   checkAnswer(rows, expectedRows)
 }
+
+  test("SPARK-27512: Decimal type inference should not handle ',' for backward compatibility") {
+    assert(spark.read
+      .option("delimiter", "|")
+      .option("inferSchema", "true")
+      .csv(Seq("1,2").toDS).schema.head.dataType === StringType)
+  }
 }
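For completeness, a sketch of the user-facing behavior, assuming a spark-shell session with spark and its implicits in scope. The en-US case is exactly what the new test asserts; the de-DE case is an extrapolation from the CSVInferSchemaSuite expectations above and assumes the CSV locale read option is available on this branch:

import spark.implicits._

// Default (en-US) locale: "1,2" no longer parses as the decimal 12,
// so the single column is inferred as a string.
spark.read
  .option("delimiter", "|")
  .option("inferSchema", "true")
  .csv(Seq("1,2").toDS)
  .schema.head.dataType            // StringType

// Explicit locale: locale-formatted grouping can still yield a decimal.
spark.read
  .option("delimiter", "|")
  .option("locale", "de-DE")
  .option("inferSchema", "true")
  .csv(Seq("1.000.001").toDS)
  .schema.head.dataType            // DecimalType(7,0)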
}