From 6d6b3ca8eedc7bf6381cb8d746fe8a3ee0a281b2 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Tue, 27 Feb 2018 18:41:01 +0100
Subject: [PATCH 1/9] Test: skip bytes disallowed in UTF-8

---
 sql/core/src/test/resources/test-data/utf8xFF.csv        | 3 +++
 .../spark/sql/execution/datasources/csv/CSVSuite.scala   | 9 +++++++++
 2 files changed, 12 insertions(+)
 create mode 100644 sql/core/src/test/resources/test-data/utf8xFF.csv
diff --git a/sql/core/src/test/resources/test-data/utf8xFF.csv b/sql/core/src/test/resources/test-data/utf8xFF.csv
new file mode 100644
index 0000000000000..2c60488654e71
--- /dev/null
+++ b/sql/core/src/test/resources/test-data/utf8xFF.csv
@@ -0,0 +1,3 @@
+keycode,alternatechannel,alternatesubchannel,corenoncore
+33MOKI33,Service Provider,United HealthCare,Core
+U9NAJ1,Community Memberships,AAGENÿ ,Core
\ No newline at end of file
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 4398e547d9217..91352219561ab 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1279,4 +1279,13 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
       Row("0,2013-111-11 12:13:14") :: Row(null) :: Nil
     )
   }
+
+  test("skip the first byte of a char if it is disallowed in UTF-8") {
+    val result = spark.read
+      .format("csv")
+      .option("header", "true")
+      .load(testFile("test-data/utf8xFF.csv"))
+
+    assert(result.count() == 2)
+  }
 }

From 0f474a033f152340519b10676026b80b7593c2f5 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Tue, 27 Feb 2018 18:43:53 +0100
Subject: [PATCH 2/9] Making correct map of first byte to char size in UTF-8
 and skip bytes disallowed in UTF-8

---
 .../apache/spark/unsafe/types/UTF8String.java | 44 +++++++++++++++----
 1 file changed, 36 insertions(+), 8 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index b0d0c44823e68..bbb3a66c576d3 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -57,12 +57,39 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
   public Object getBaseObject() { return base; }
   public long getBaseOffset() { return offset; }
 
-  private static int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    5, 5, 5, 5,
-    6, 6};
+  /**
+   * A char in UTF-8 encoding can take 1-4 bytes depending on the first byte which
+   * indicates the size of the char. See Unicode standard in page 126:
+   * http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf
+   *
+   * Binary    Hex          Comments
+   * 0xxxxxxx  0x00..0x7F   Only byte of a 1-byte character encoding
+   * 10xxxxxx  0x80..0xBF   Continuation bytes (1-3 continuation bytes)
+   * 110xxxxx  0xC0..0xDF   First byte of a 2-byte character encoding
+   * 1110xxxx  0xE0..0xEF   First byte of a 3-byte character encoding
+   * 11110xxx  0xF0..0xF4   First byte of a 4-byte character encoding
+   */
+  private static byte[] bytesOfCodePointInUTF8 = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..0x0F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..0x1F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..0x2F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..0x3F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..0x4F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..0x5F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..0x6F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..0x7F
+    // Continuation bytes cannot appear as the first byte; 0 marks an invalid first byte
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80..0x8F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90..0x9F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0..0xAF
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0..0xBF
+    0, 0, // 0xC0..0xC1 - disallowed in UTF-8
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC2..0xCF
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0..0xDF
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0..0xEF
+    4, 4, 4, 4, 4, // 0xF0..0xF4
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 0xF5..0xFF - disallowed in UTF-8
+  };
 
   private static final boolean IS_LITTLE_ENDIAN =
       ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN;
@@ -187,8 +214,9 @@ public void writeTo(OutputStream out) throws IOException {
    * @param b The first byte of a code point
    */
   private static int numBytesForFirstByte(final byte b) {
-    final int offset = (b & 0xFF) - 192;
-    return (offset >= 0) ? bytesOfCodePointInUTF8[offset] : 1;
+    // Zero-extend the byte to an index; 0 in the table marks a byte that cannot start a char.
+    final byte size = bytesOfCodePointInUTF8[b & 0xFF];
+    return (size != 0) ? size : 1; // Skip the first byte disallowed in UTF-8
   }
 
   /**

From 2ee661618a40544a8cbf3ac0794a1a6b86b6b62d Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Wed, 28 Feb 2018 14:26:21 +0100
Subject: [PATCH 3/9] Check inferred schema and returned bad string

---
 .../sql/execution/datasources/csv/CSVSuite.scala  | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 91352219561ab..3f95300e611b1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1281,11 +1281,22 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
   }
 
   test("skip the first byte of a char if it is disallowed in UTF-8") {
-    val result = spark.read
+    val df = spark.read
       .format("csv")
       .option("header", "true")
       .load(testFile("test-data/utf8xFF.csv"))
+    val expectedSchema = new StructType()
+      .add("keycode", StringType)
+      .add("alternatechannel", StringType)
+      .add("alternatesubchannel", StringType)
+      .add("corenoncore", StringType)
+
+    assert(df.schema == expectedSchema)
 
-    assert(result.count() == 2)
+    val badStr = new String("AAGEN".getBytes :+ 0xff.toByte) ++ " "
+    checkAnswer(
+      df.select("alternatesubchannel"),
+      Row("United HealthCare") :: Row(badStr) :: Nil
+    )
   }
 }

From d6c5f02ea1a08513a54ea9f3b30986dd92188b3e Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Wed, 28 Feb 2018 15:04:22 +0100
Subject: [PATCH 4/9] The test csv was simplified

---
 sql/core/src/test/resources/test-data/utf8xFF.csv    |  6 +++---
 .../sql/execution/datasources/csv/CSVSuite.scala     | 12 +++++-------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/sql/core/src/test/resources/test-data/utf8xFF.csv b/sql/core/src/test/resources/test-data/utf8xFF.csv
index 2c60488654e71..43eb098cb4c01 100644
--- a/sql/core/src/test/resources/test-data/utf8xFF.csv
+++ b/sql/core/src/test/resources/test-data/utf8xFF.csv
@@ -1,3 +1,3 @@
-keycode,alternatechannel,alternatesubchannel,corenoncore
-33MOKI33,Service Provider,United HealthCare,Core
-U9NAJ1,Community Memberships,AAGENÿ ,Core
\ No newline at end of file
+channel,code
+United,123
+ABGUNÿ,456
\ No newline at end of file
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 3f95300e611b1..27ffec7760b7f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1286,17 +1286,15 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
       .option("header", "true")
       .load(testFile("test-data/utf8xFF.csv"))
     val expectedSchema = new StructType()
-      .add("keycode", StringType)
-      .add("alternatechannel", StringType)
-      .add("alternatesubchannel", StringType)
-      .add("corenoncore", StringType)
+      .add("channel", StringType)
+      .add("code", StringType)
 
     assert(df.schema == expectedSchema)
 
-    val badStr = new String("AAGEN".getBytes :+ 0xff.toByte) ++ " "
+    val badStr = new String("ABGUN".getBytes :+ 0xff.toByte)
     checkAnswer(
-      df.select("alternatesubchannel"),
-      Row("United HealthCare") :: Row(badStr) :: Nil
+      df.select("channel"),
+      Row("United") :: Row(badStr) :: Nil
     )
   }
 }

From 50b17b1287e1325234ad52e8b5401483df2f7bd2 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Thu, 15 Mar 2018 21:40:50 +0100
Subject: [PATCH 5/9] Test for handling of the first byte of UTF-8 chars

---
 .../src/test/resources/test-data/utf8xFF.csv  |  3 -
 .../execution/datasources/csv/CSVSuite.scala  | 63 ++++++++++++++-----
 2 files changed, 48 insertions(+), 18 deletions(-)
 delete mode 100644 sql/core/src/test/resources/test-data/utf8xFF.csv

diff --git a/sql/core/src/test/resources/test-data/utf8xFF.csv b/sql/core/src/test/resources/test-data/utf8xFF.csv
deleted file mode 100644
index 43eb098cb4c01..0000000000000
--- a/sql/core/src/test/resources/test-data/utf8xFF.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-channel,code
-United,123
-ABGUNÿ,456
\ No newline at end of file
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 27ffec7760b7f..ef92a5d9c1280 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.csv
 
 import java.io.File
 import java.nio.charset.UnsupportedCharsetException
+import java.nio.file.{Files, Paths}
 import java.sql.{Date, Timestamp}
 import java.text.SimpleDateFormat
 import java.util.Locale
@@ -1280,21 +1281,53 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
     )
   }
 
-  test("skip the first byte of a char if it is disallowed in UTF-8") {
-    val df = spark.read
-      .format("csv")
-      .option("header", "true")
-      .load(testFile("test-data/utf8xFF.csv"))
-    val expectedSchema = new StructType()
-      .add("channel", StringType)
-      .add("code", StringType)
-
-    assert(df.schema == expectedSchema)
+  def testHandlingUTF8Char(utf8char: Array[Byte]): Unit = {
+    val inHex = utf8char.map("%02x".format(_)).mkString("_")
+    test(s"SPARK-23649: handle the first byte of the char: $inHex") {
+      withTempPath { path =>
+        val filename = s"${path.getAbsolutePath}.csv"
+        val header = "code,channel\n".getBytes
+        val row = "ABGUN".getBytes ++ utf8char ++ ",United".getBytes
+        val content = header ++ row
+        Files.write(Paths.get(filename), content)
 
-    val badStr = new String("ABGUN".getBytes :+ 0xff.toByte)
-    checkAnswer(
-      df.select("channel"),
-      Row("United") :: Row(badStr) :: Nil
-    )
+        val df = spark.read
+          .format("csv")
+          .option("header", "true")
+          .load(filename)
+        val expectedSchema = new StructType()
+          .add("code", StringType)
+          .add("channel", StringType)
+
+        assert(df.schema == expectedSchema)
+
+        val badStr = new String("ABGUN".getBytes ++ utf8char)
+        checkAnswer(
+          df,
+          Row(badStr, "United") :: Nil
+        )
+      }
+    }
   }
+
+  Seq(
+    // Binary    Hex          Comments
+    // 0xxxxxxx  0x00..0x7F   Only byte of a 1-byte character encoding
+    Array(0x00), Array(0x7F),
+    // 10xxxxxx  0x80..0xBF   Continuation bytes (1-3 continuation bytes)
+    // The byte should be skipped because it cannot be the first byte
+    Array(0x80), Array(0xBF),
+    // 0xC0..0xC1 - disallowed in UTF-8
+    // The byte should be skipped because it cannot be the first byte
+    Array(0xC0), Array(0xC1),
+    // 110xxxxx  0xC0..0xDF   First byte of a 2-byte character encoding
+    Array(0xc2, 0x80), Array(0xdf, 0xbf),
+    // 1110xxxx  0xE0..0xEF   First byte of a 3-byte character encoding
+    Array(0xe0, 0xa0, 0x80), Array(0xef, 0xbf, 0xbf),
+    // 11110xxx  0xF0..0xF4   First byte of a 4-byte character encoding
+    Array(0xf0, 0x9f, 0x9c, 0x80), Array(0xf4, 0x80, 0x83, 0xbf),
+    // 0xF5..0xFF - disallowed in UTF-8
+    // The byte should be skipped because it cannot be the first byte
+    Array(0xF5), Array(0xFF)
+  ).map(a => a.map(_.toByte)).foreach(testHandlingUTF8Char(_))
 }

From c65f82782443c22cfcdc86dd01195fe8f623f2a4 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 16 Mar 2018 14:33:54 +0100
Subject: [PATCH 6/9] Using explicit charset in converting of a string to a
 byte array

---
 .../sql/execution/datasources/csv/CSVSuite.scala      | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index ef92a5d9c1280..1f2443f77faa2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.execution.datasources.csv
 
 import java.io.File
-import java.nio.charset.UnsupportedCharsetException
+import java.nio.charset.{StandardCharsets, UnsupportedCharsetException}
 import java.nio.file.{Files, Paths}
 import java.sql.{Date, Timestamp}
 import java.text.SimpleDateFormat
@@ -1285,9 +1285,12 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
     val inHex = utf8char.map("%02x".format(_)).mkString("_")
     test(s"SPARK-23649: handle the first byte of the char: $inHex") {
       withTempPath { path =>
+        def getBytes(str: String): Array[Byte] = {
+          str.getBytes(StandardCharsets.UTF_8)
+        }
         val filename = s"${path.getAbsolutePath}.csv"
-        val header = "code,channel\n".getBytes
-        val row = "ABGUN".getBytes ++ utf8char ++ ",United".getBytes
+        val header = getBytes("code,channel\n")
+        val row = getBytes("ABGUN") ++ utf8char ++ getBytes(",United")
         val content = header ++ row
         Files.write(Paths.get(filename), content)
 
@@ -1301,7 +1304,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
 
         assert(df.schema == expectedSchema)
 
-        val badStr = new String("ABGUN".getBytes ++ utf8char)
+        val badStr = new String(getBytes("ABGUN") ++ utf8char)
         checkAnswer(
           df,
           Row(badStr, "United") :: Nil

From 27e5a5b3023fc8dd65b65c41650ae22dd531e395 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 16 Mar 2018 15:53:54 +0100
Subject: [PATCH 7/9] Basic tests for 2-4 bytes chars and a test for skipping
 of wrong first byte of UTF-8 char

---
 .../spark/unsafe/types/UTF8StringSuite.java   | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 9b303fa5bc6c5..d244908f4b338 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -58,8 +58,12 @@ private static void checkBasic(String str, int len) {
   @Test
   public void basicTest() {
     checkBasic("", 0);
-    checkBasic("hello", 5);
+    checkBasic("¡", 1); // one 2-byte char
+    checkBasic("ку", 2); // two 2-byte chars
+    checkBasic("hello", 5); // five 1-byte chars
     checkBasic("大 千 世 界", 7);
+    checkBasic("︽﹋％", 3); // three 3-byte chars
+    checkBasic("\uD83E\uDD19", 1); // one 4-byte char (a surrogate pair in Java)
   }
 
   @Test
@@ -791,4 +795,18 @@ public void trimRightWithTrimString() {
     assertEquals(fromString("头"), fromString("头a???/").trimRight(fromString("数?/*&^%a")));
     assertEquals(fromString("头"), fromString("头数b数数 [").trimRight(fromString(" []数b")));
   }
+
+  @Test
+  public void skipWrongFirstByte() {
+    int[] wrongFirstBytes = {
+      0x80, 0xBF, // Skip Continuation bytes
+      0xC0, 0xC2, // 0xC0..0xC1 - disallowed in UTF-8
+      0xF5, 0xF9, 0xFF // 0xF5..0xFF - disallowed in UTF-8
+    };
+    for (int i = 0; i < wrongFirstBytes.length; ++i) {
+      byte[] c = new byte[1];
+      c[0] = (byte)wrongFirstBytes[i];
+      assertEquals(fromBytes(c).numChars(), 1);
+    }
+  }
 }

From 8a501e359eab47f08c44dcdf3d88a0dec3f71470 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Sat, 17 Mar 2018 10:46:13 +0100
Subject: [PATCH 8/9] Removing tests for CSV and adding additional cases for
 UTF8StringSuite

---
 .../spark/unsafe/types/UTF8StringSuite.java   | 13 +++--
 .../execution/datasources/csv/CSVSuite.scala  | 56 +------------------
 2 files changed, 9 insertions(+), 60 deletions(-)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index d244908f4b338..7c34d419574ef 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -799,12 +799,15 @@ public void trimRightWithTrimString() {
   @Test
   public void skipWrongFirstByte() {
     int[] wrongFirstBytes = {
-      0x80, 0xBF, // Skip Continuation bytes
-      0xC0, 0xC2, // 0xC0..0xC1 - disallowed in UTF-8
-      0xF5, 0xF9, 0xFF // 0xF5..0xFF - disallowed in UTF-8
+      0x80, 0x9F, 0xBF, // Skip Continuation bytes
+      0xC0, 0xC1, // 0xC0..0xC1 - disallowed in UTF-8
+      // 0xF5..0xFF - disallowed in UTF-8
+      0xF5, 0xF6, 0xF7, 0xF8, 0xF9,
+      0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
     };
+    byte[] c = new byte[1];
+
     for (int i = 0; i < wrongFirstBytes.length; ++i) {
-      byte[] c = new byte[1];
       c[0] = (byte)wrongFirstBytes[i];
-      assertEquals(fromBytes(c).numChars(), 1);
+      assertEquals(1, fromBytes(c).numChars());
     }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 1f2443f77faa2..4398e547d9217 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -18,8 +18,7 @@
 package org.apache.spark.sql.execution.datasources.csv
 
 import java.io.File
-import java.nio.charset.{StandardCharsets, UnsupportedCharsetException}
-import java.nio.file.{Files, Paths}
+import java.nio.charset.UnsupportedCharsetException
 import java.sql.{Date, Timestamp}
 import java.text.SimpleDateFormat
 import java.util.Locale
@@ -1280,57 +1279,4 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
       Row("0,2013-111-11 12:13:14") :: Row(null) :: Nil
     )
   }
-
-  def testHandlingUTF8Char(utf8char: Array[Byte]): Unit = {
-    val inHex = utf8char.map("%02x".format(_)).mkString("_")
-    test(s"SPARK-23649: handle the first byte of the char: $inHex") {
-      withTempPath { path =>
-        def getBytes(str: String): Array[Byte] = {
-          str.getBytes(StandardCharsets.UTF_8)
-        }
-        val filename = s"${path.getAbsolutePath}.csv"
-        val header = getBytes("code,channel\n")
-        val row = getBytes("ABGUN") ++ utf8char ++ getBytes(",United")
-        val content = header ++ row
-        Files.write(Paths.get(filename), content)
-
-        val df = spark.read
-          .format("csv")
-          .option("header", "true")
-          .load(filename)
-        val expectedSchema = new StructType()
-          .add("code", StringType)
-          .add("channel", StringType)
-
-        assert(df.schema == expectedSchema)
-
-        val badStr = new String(getBytes("ABGUN") ++ utf8char)
-        checkAnswer(
-          df,
-          Row(badStr, "United") :: Nil
-        )
-      }
-    }
-  }
-
-  Seq(
-    // Binary    Hex          Comments
-    // 0xxxxxxx  0x00..0x7F   Only byte of a 1-byte character encoding
-    Array(0x00), Array(0x7F),
-    // 10xxxxxx  0x80..0xBF   Continuation bytes (1-3 continuation bytes)
-    // The byte should be skipped because it cannot be the first byte
-    Array(0x80), Array(0xBF),
-    // 0xC0..0xC1 - disallowed in UTF-8
-    // The byte should be skipped because it cannot be the first byte
-    Array(0xC0), Array(0xC1),
-    // 110xxxxx  0xC0..0xDF   First byte of a 2-byte character encoding
-    Array(0xc2, 0x80), Array(0xdf, 0xbf),
-    // 1110xxxx  0xE0..0xEF   First byte of a 3-byte character encoding
-    Array(0xe0, 0xa0, 0x80), Array(0xef, 0xbf, 0xbf),
-    // 11110xxx  0xF0..0xF4   First byte of a 4-byte character encoding
-    Array(0xf0, 0x9f, 0x9c, 0x80), Array(0xf4, 0x80, 0x83, 0xbf),
-    // 0xF5..0xFF - disallowed in UTF-8
-    // The byte should be skipped because it cannot be the first byte
-    Array(0xF5), Array(0xFF)
-  ).map(a => a.map(_.toByte)).foreach(testHandlingUTF8Char(_))
 }

From 5557a80d4674e929332d9441342e5b90e314eb45 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Mon, 19 Mar 2018 21:45:44 +0100
Subject: [PATCH 9/9] Comment about disallowed first bytes according to UTF-8
 standard

---
 .../java/org/apache/spark/unsafe/types/UTF8String.java    | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index bbb3a66c576d3..5d468aed42337 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -59,7 +59,7 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
 
   /**
    * A char in UTF-8 encoding can take 1-4 bytes depending on the first byte which
-   * indicates the size of the char. See Unicode standard in page 126:
+   * indicates the size of the char. See Unicode standard in page 126, Table 3-6:
    * http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf
    *
    * Binary    Hex          Comments
@@ -67,7 +67,11 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
    * 10xxxxxx  0x80..0xBF   Continuation bytes (1-3 continuation bytes)
    * 110xxxxx  0xC0..0xDF   First byte of a 2-byte character encoding
    * 1110xxxx  0xE0..0xEF   First byte of a 3-byte character encoding
-   * 11110xxx  0xF0..0xF4   First byte of a 4-byte character encoding
+   * 11110xxx  0xF0..0xF7   First byte of a 4-byte character encoding
+   *
+   * As a consequence of the well-formedness conditions specified in
+   * Table 3-7 (page 126), the following byte values are disallowed in UTF-8:
+   *   C0–C1, F5–FF.
    */
   private static byte[] bytesOfCodePointInUTF8 = {
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..0x0F