[SPARK-19949][SQL] unify bad record handling in CSV and JSON #17315
Changes from 1 commit
`JacksonParser.scala`
```diff
@@ -32,17 +32,14 @@ import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 import org.apache.spark.util.Utils
 
-private[sql] class SparkSQLJsonProcessingException(msg: String) extends RuntimeException(msg)
-
 /**
  * Constructs a parser for a given schema that translates a json string to an [[InternalRow]].
  */
 class JacksonParser(
     schema: StructType,
-    options: JSONOptions) extends Logging {
+    val options: JSONOptions) extends Logging {
 
   import JacksonUtils._
-  import ParseModes._
   import com.fasterxml.jackson.core.JsonToken._
 
   // A `ValueConverter` is responsible for converting a value from `JsonParser`
```
```diff
@@ -55,107 +52,7 @@ class JacksonParser(
   private val factory = new JsonFactory()
   options.setJacksonOptions(factory)
 
-  private val emptyRow: Seq[InternalRow] = Seq(new GenericInternalRow(schema.length))
-
-  private val corruptFieldIndex = schema.getFieldIndex(options.columnNameOfCorruptRecord)
-  corruptFieldIndex.foreach { corrFieldIndex =>
-    require(schema(corrFieldIndex).dataType == StringType)
-    require(schema(corrFieldIndex).nullable)
-  }
```
> **Member:** The above checks seem to be missing from the new code.

> **Contributor (Author):** This is just a sanity check; actually this check is already done in …
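For context, the removed `require` calls amount to a small, self-contained validation. A minimal sketch of that check factored into a shared helper, under the assumption that callers validate the schema before constructing the parser (the object and method names below are illustrative, not from this PR):

```scala
import org.apache.spark.sql.types.{StringType, StructType}

// Hypothetical shared helper mirroring the removed sanity check: if the
// schema contains the corrupt-record column, it must be a nullable string.
object CorruptRecordChecks {
  def verify(schema: StructType, columnNameOfCorruptRecord: String): Unit = {
    schema.getFieldIndex(columnNameOfCorruptRecord).foreach { idx =>
      require(schema(idx).dataType == StringType,
        s"The corrupt record column $columnNameOfCorruptRecord must be StringType")
      require(schema(idx).nullable,
        s"The corrupt record column $columnNameOfCorruptRecord must be nullable")
    }
  }
}
```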
The same hunk continues with the rest of the removed warning helpers and `failedRecord`:

```diff
-
-  @transient
-  private[this] var isWarningPrinted: Boolean = false
-
-  @transient
-  private def printWarningForMalformedRecord(record: () => UTF8String): Unit = {
-    def sampleRecord: String = {
-      if (options.wholeFile) {
-        ""
-      } else {
-        s"Sample record: ${record()}\n"
-      }
-    }
-
-    def footer: String = {
-      s"""Code example to print all malformed records (scala):
-         |===================================================
-         |// The corrupted record exists in column ${options.columnNameOfCorruptRecord}.
-         |val parsedJson = spark.read.json("/path/to/json/file/test.json")
-         |
-       """.stripMargin
-    }
-
-    if (options.permissive) {
-      logWarning(
-        s"""Found at least one malformed record. The JSON reader will replace
-           |all malformed records with placeholder null in current $PERMISSIVE_MODE parser mode.
-           |To find out which corrupted records have been replaced with null, please use the
-           |default inferred schema instead of providing a custom schema.
-           |
-           |${sampleRecord ++ footer}
-           |
-         """.stripMargin)
-    } else if (options.dropMalformed) {
-      logWarning(
-        s"""Found at least one malformed record. The JSON reader will drop
-           |all malformed records in current $DROP_MALFORMED_MODE parser mode. To find out which
-           |corrupted records have been dropped, please switch the parser mode to $PERMISSIVE_MODE
-           |mode and use the default inferred schema.
-           |
-           |${sampleRecord ++ footer}
-           |
-         """.stripMargin)
-    }
-  }
-
-  @transient
-  private def printWarningIfWholeFile(): Unit = {
-    if (options.wholeFile && corruptFieldIndex.isDefined) {
-      logWarning(
-        s"""Enabling wholeFile mode and defining columnNameOfCorruptRecord may result
-           |in very large allocations or OutOfMemoryExceptions being raised.
-           |
-         """.stripMargin)
-    }
-  }
-
-  /**
-   * This function deals with the cases it fails to parse. This function will be called
-   * when exceptions are caught during converting. This functions also deals with `mode` option.
-   */
-  private def failedRecord(record: () => UTF8String): Seq[InternalRow] = {
-    corruptFieldIndex match {
-      case _ if options.failFast =>
-        if (options.wholeFile) {
-          throw new SparkSQLJsonProcessingException("Malformed line in FAILFAST mode")
-        } else {
-          throw new SparkSQLJsonProcessingException(s"Malformed line in FAILFAST mode: ${record()}")
-        }
-
-      case _ if options.dropMalformed =>
-        if (!isWarningPrinted) {
-          printWarningForMalformedRecord(record)
-          isWarningPrinted = true
-        }
-        Nil
-
-      case None =>
-        if (!isWarningPrinted) {
-          printWarningForMalformedRecord(record)
-          isWarningPrinted = true
-        }
-        emptyRow
-
-      case Some(corruptIndex) =>
-        if (!isWarningPrinted) {
-          printWarningIfWholeFile()
-          isWarningPrinted = true
-        }
-        val row = new GenericInternalRow(schema.length)
-        row.update(corruptIndex, record())
-        Seq(row)
-    }
-  }
+  private val emptyRow = new GenericInternalRow(schema.length)
 
   /**
    * Create a converter which converts the JSON documents held by the `JsonParser`
```
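The block removed above was the only place that interpreted the `mode` option inside the JSON parser; the CSV reader had its own copy of the same logic, which is what this PR unifies. For reference, a minimal sketch of the user-facing behavior being centralized (the file path and schema are made up for illustration):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{StringType, StructField, StructType}

val spark = SparkSession.builder().appName("parse-modes").master("local[*]").getOrCreate()

// A schema that opts in to capturing raw malformed records. The column named
// by columnNameOfCorruptRecord must be a nullable StringType field.
val schema = StructType(Seq(
  StructField("a", StringType, nullable = true),
  StructField("_corrupt_record", StringType, nullable = true)))

// PERMISSIVE (default): malformed records become placeholder rows of nulls,
// with the raw text kept in the corrupt-record column when the schema has one.
val permissive = spark.read
  .schema(schema)
  .option("mode", "PERMISSIVE")
  .option("columnNameOfCorruptRecord", "_corrupt_record")
  .json("/path/to/json/file/test.json")

// DROPMALFORMED: malformed records are dropped entirely.
val dropMalformed = spark.read
  .schema(schema)
  .option("mode", "DROPMALFORMED")
  .json("/path/to/json/file/test.json")

// FAILFAST: the first malformed record fails the query with an exception.
val failFast = spark.read
  .schema(schema)
  .option("mode", "FAILFAST")
  .json("/path/to/json/file/test.json")
```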
```diff
@@ -239,7 +136,7 @@ class JacksonParser(
           lowerCaseValue.equals("-inf")) {
           value.toFloat
         } else {
-          throw new SparkSQLJsonProcessingException(s"Cannot parse $value as FloatType.")
+          throw new RuntimeException(s"Cannot parse $value as FloatType.")
         }
       }
 
```
```diff
@@ -259,7 +156,7 @@ class JacksonParser(
           lowerCaseValue.equals("-inf")) {
           value.toDouble
         } else {
-          throw new SparkSQLJsonProcessingException(s"Cannot parse $value as DoubleType.")
+          throw new RuntimeException(s"Cannot parse $value as DoubleType.")
         }
       }
 
```
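Both fractional-type hunks keep the special-case acceptance of quoted NaN and infinity tokens; only the exception class changes from `SparkSQLJsonProcessingException` to a plain `RuntimeException`. A standalone restatement of the rule these branches implement (plain Scala, independent of Jackson; equivalent in intent, not a copy of the diff):

```scala
import java.util.Locale

// A quoted string is accepted for a FloatType column only when it spells
// NaN or an infinity (in any letter case); any other string is a bad record.
def parseFloatToken(value: String): Float =
  value.toLowerCase(Locale.ROOT) match {
    case "nan"                => Float.NaN
    case "infinity" | "inf"   => Float.PositiveInfinity
    case "-infinity" | "-inf" => Float.NegativeInfinity
    case _ => throw new RuntimeException(s"Cannot parse $value as FloatType.")
  }
```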
```diff
@@ -391,9 +288,9 @@ class JacksonParser(
 
       case token =>
         // We cannot parse this token based on the given data type. So, we throw a
-        // SparkSQLJsonProcessingException and this exception will be caught by
+        // RuntimeException and this exception will be caught by
         // `parse` method.
-        throw new SparkSQLJsonProcessingException(
+        throw new RuntimeException(
           s"Failed to parse a value for data type $dataType (current token: $token).")
     }
 
```
```diff
@@ -466,14 +363,14 @@ class JacksonParser(
         parser.nextToken() match {
           case null => Nil
           case _ => rootConverter.apply(parser) match {
-            case null => throw new SparkSQLJsonProcessingException("Root converter returned null")
+            case null => throw new RuntimeException("Root converter returned null")
             case rows => rows
           }
         }
       }
     } catch {
-      case _: JsonProcessingException | _: SparkSQLJsonProcessingException =>
-        failedRecord(() => recordLiteral(record))
+      case e @ (_: RuntimeException | _: JsonProcessingException) =>
+        throw BadRecordException(() => recordLiteral(record), () => emptyRow, e)
     }
   }
 }
```
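With this change the parser no longer decides what happens to a bad record. It wraps the failure in a `BadRecordException` carrying the raw record text, a partial-result fallback, and the underlying cause, all deferred behind thunks, and the per-mode policy moves to a caller shared by CSV and JSON. A rough sketch of that division of labor, with the exception's shape inferred from the call site above and the handler invented for illustration:

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.unsafe.types.UTF8String

// Shape inferred from the call site above: both fields are thunks, so the raw
// record is only materialized when a mode actually needs it.
case class BadRecordException(
    record: () => UTF8String,
    partialResult: () => InternalRow,
    cause: Throwable) extends Exception(cause)

// Hypothetical caller-side dispatch: one shared place applies the parse mode
// for every text-based source instead of each parser re-implementing it.
def handleBadRecord(parse: () => Seq[InternalRow], mode: String): Seq[InternalRow] =
  try parse() catch {
    case e: BadRecordException => mode match {
      // PERMISSIVE: emit the placeholder row; a real implementation would also
      // store e.record() in the corrupt-record column when one is configured.
      case "PERMISSIVE" => Seq(e.partialResult())
      // DROPMALFORMED: silently skip the record.
      case "DROPMALFORMED" => Nil
      // FAILFAST: surface the failure immediately.
      case "FAILFAST" =>
        throw new RuntimeException("Malformed line in FAILFAST mode", e)
    }
  }
```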
> **Comment:** How about creating an enum, like what we are doing for `SaveMode`?

> **Reply:** Yea, this can be a good follow-up.
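For concreteness, the suggestion might look something like the sketch below, in the spirit of `SaveMode` (illustrative only; not code from this PR, and the eventual follow-up may differ):

```scala
import java.util.Locale

// A sealed hierarchy instead of the raw mode strings that ParseModes compares
// against, so exhaustiveness is checked wherever the mode is pattern-matched.
sealed trait ParseMode
object ParseMode {
  /** Replace malformed records with placeholder rows (the default). */
  case object Permissive extends ParseMode
  /** Drop malformed records entirely. */
  case object DropMalformed extends ParseMode
  /** Fail the query on the first malformed record. */
  case object FailFast extends ParseMode

  /** Parse a user-supplied mode string, defaulting to Permissive. */
  def fromString(mode: String): ParseMode = mode.toUpperCase(Locale.ROOT) match {
    case "PERMISSIVE"    => Permissive
    case "DROPMALFORMED" => DropMalformed
    case "FAILFAST"      => FailFast
    case _               => Permissive
  }
}
```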