apache · HyukjinKwon · Jun 5, 2017 · HyukjinKwon · Jun 7, 2017 · maropu
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -53,7 +53,10 @@ class UnivocityParser(
 
   // Retrieve the raw record string.
   private def getCurrentInput: UTF8String = {
-    UTF8String.fromString(tokenizer.getContext.currentParsedContent().stripLineEnd)
+    // The parsed content may be null, so the line ending is stripped only when it is present;
+    // a null is handed to `UTF8String.fromString` unchanged.
+    val rawContent = tokenizer.getContext.currentParsedContent()
+    UTF8String.fromString(if (rawContent == null) null else rawContent.stripLineEnd)
   }
 
   // This parser first picks some tokens from the input tokens, according to the required schema,

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1174,4 +1174,14 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
         }
       }
   }
+
+  test("SPARK-20978: Set null for malformed column when the number of tokens is less than schema") {
+    // A single-token line is read against a three-column schema: the missing column `b` and the
+    // corrupt-record column `unparsed` are both expected to come back as null.
+    val reader = spark.read
+      .schema("a string, b string, unparsed string")
+      .option("columnNameOfCorruptRecord", "unparsed")
+    val shortRecord = Seq("a").toDS()
+    checkAnswer(reader.csv(shortRecord), Row("a", null, null))
+  }
 }