From 6d6b3ca8eedc7bf6381cb8d746fe8a3ee0a281b2 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Tue, 27 Feb 2018 18:41:01 +0100 Subject: [PATCH 1/9] Test: skip bytes disallowed in UTF-8 --- sql/core/src/test/resources/test-data/utf8xFF.csv | 3 +++ .../spark/sql/execution/datasources/csv/CSVSuite.scala | 9 +++++++++ 2 files changed, 12 insertions(+) create mode 100644 sql/core/src/test/resources/test-data/utf8xFF.csv diff --git a/sql/core/src/test/resources/test-data/utf8xFF.csv b/sql/core/src/test/resources/test-data/utf8xFF.csv new file mode 100644 index 0000000000000..2c60488654e71 --- /dev/null +++ b/sql/core/src/test/resources/test-data/utf8xFF.csv @@ -0,0 +1,3 @@ +keycode,alternatechannel,alternatesubchannel,corenoncore +33MOKI33,Service Provider,United HealthCare,Core +U9NAJ1,Community Memberships,AAGEN ,Core \ No newline at end of file diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 4398e547d9217..91352219561ab 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1279,4 +1279,13 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { Row("0,2013-111-11 12:13:14") :: Row(null) :: Nil ) } + + test("skip the first byte of a char if it is disallowed in UTF-8") { + val result = spark.read + .format("csv") + .option("header", "true") + .load(testFile("test-data/utf8xFF.csv")) + + assert(result.count() == 2) + } } From 0f474a033f152340519b10676026b80b7593c2f5 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Tue, 27 Feb 2018 18:43:53 +0100 Subject: [PATCH 2/9] Making correct map of first byte to char size in UTF-8 and skip bytes disallowed in UTF-8 --- .../apache/spark/unsafe/types/UTF8String.java | 44 +++++++++++++++---- 1 file changed, 36 insertions(+), 8 
deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index b0d0c44823e68..bbb3a66c576d3 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -57,12 +57,39 @@ public final class UTF8String implements Comparable, Externalizable, public Object getBaseObject() { return base; } public long getBaseOffset() { return offset; } - private static int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, - 6, 6}; + /** + * A char in UTF-8 encoding can take 1-4 bytes depending on the first byte which + * indicates the size of the char. See Unicode standard in page 126: + * http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf + * + * Binary Hex Comments + * 0xxxxxxx 0x00..0x7F Only byte of a 1-byte character encoding + * 10xxxxxx 0x80..0xBF Continuation bytes (1-3 continuation bytes) + * 110xxxxx 0xC0..0xDF First byte of a 2-byte character encoding + * 1110xxxx 0xE0..0xEF First byte of a 3-byte character encoding + * 11110xxx 0xF0..0xF4 First byte of a 4-byte character encoding + */ + private static byte[] bytesOfCodePointInUTF8 = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..0x0F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..0x1F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..0x2F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..0x3F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..0x4F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..0x5F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..0x6F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..0x7F + // Continuation bytes cannot 
appear as the first byte + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80..0x8F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90..0x9F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0..0xAF + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0..0xBF + 0, 0, // 0xC0..0xC1 - disallowed in UTF-8 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC2..0xCF + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0..0xDF + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0..0xEF + 4, 4, 4, 4, 4, // 0xF0..0xF4 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 0xF5..0xFF - disallowed in UTF-8 + }; private static final boolean IS_LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN; @@ -187,8 +214,9 @@ public void writeTo(OutputStream out) throws IOException { * @param b The first byte of a code point */ private static int numBytesForFirstByte(final byte b) { - final int offset = (b & 0xFF) - 192; - return (offset >= 0) ? bytesOfCodePointInUTF8[offset] : 1; + final int offset = b & 0xFF; + byte numBytes = bytesOfCodePointInUTF8[offset]; + return (numBytes == 0) ? 
1: numBytes; // Skip the first byte disallowed in UTF-8 } /** From 2ee661618a40544a8cbf3ac0794a1a6b86b6b62d Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Wed, 28 Feb 2018 14:26:21 +0100 Subject: [PATCH 3/9] Check inferred schema and returned bad string --- .../sql/execution/datasources/csv/CSVSuite.scala | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 91352219561ab..3f95300e611b1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1281,11 +1281,22 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { } test("skip the first byte of a char if it is disallowed in UTF-8") { - val result = spark.read + val df = spark.read .format("csv") .option("header", "true") .load(testFile("test-data/utf8xFF.csv")) + val expectedSchema = new StructType() + .add("keycode", StringType) + .add("alternatechannel", StringType) + .add("alternatesubchannel", StringType) + .add("corenoncore", StringType) + + assert(df.schema == expectedSchema) - assert(result.count() == 2) + val badStr = new String("AAGEN".getBytes :+ 0xff.toByte) ++ " " + checkAnswer( + df.select("alternatesubchannel"), + Row("United HealthCare") :: Row(badStr) :: Nil + ) } } From d6c5f02ea1a08513a54ea9f3b30986dd92188b3e Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Wed, 28 Feb 2018 15:04:22 +0100 Subject: [PATCH 4/9] The test csv was simplified --- sql/core/src/test/resources/test-data/utf8xFF.csv | 6 +++--- .../sql/execution/datasources/csv/CSVSuite.scala | 12 +++++------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/sql/core/src/test/resources/test-data/utf8xFF.csv b/sql/core/src/test/resources/test-data/utf8xFF.csv index 
2c60488654e71..43eb098cb4c01 100644 --- a/sql/core/src/test/resources/test-data/utf8xFF.csv +++ b/sql/core/src/test/resources/test-data/utf8xFF.csv @@ -1,3 +1,3 @@ -keycode,alternatechannel,alternatesubchannel,corenoncore -33MOKI33,Service Provider,United HealthCare,Core -U9NAJ1,Community Memberships,AAGEN ,Core \ No newline at end of file +channel,code +United,123 +ABGUN,456 \ No newline at end of file diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 3f95300e611b1..27ffec7760b7f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1286,17 +1286,15 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { .option("header", "true") .load(testFile("test-data/utf8xFF.csv")) val expectedSchema = new StructType() - .add("keycode", StringType) - .add("alternatechannel", StringType) - .add("alternatesubchannel", StringType) - .add("corenoncore", StringType) + .add("channel", StringType) + .add("code", StringType) assert(df.schema == expectedSchema) - val badStr = new String("AAGEN".getBytes :+ 0xff.toByte) ++ " " + val badStr = new String("ABGUN".getBytes :+ 0xff.toByte) checkAnswer( - df.select("alternatesubchannel"), - Row("United HealthCare") :: Row(badStr) :: Nil + df.select("channel"), + Row("United") :: Row(badStr) :: Nil ) } } From 50b17b1287e1325234ad52e8b5401483df2f7bd2 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 15 Mar 2018 21:40:50 +0100 Subject: [PATCH 5/9] Test for handling of the first byte of UTF-8 chars --- .../src/test/resources/test-data/utf8xFF.csv | 3 - .../execution/datasources/csv/CSVSuite.scala | 63 ++++++++++++++----- 2 files changed, 48 insertions(+), 18 deletions(-) delete mode 100644 sql/core/src/test/resources/test-data/utf8xFF.csv 
diff --git a/sql/core/src/test/resources/test-data/utf8xFF.csv b/sql/core/src/test/resources/test-data/utf8xFF.csv deleted file mode 100644 index 43eb098cb4c01..0000000000000 --- a/sql/core/src/test/resources/test-data/utf8xFF.csv +++ /dev/null @@ -1,3 +0,0 @@ -channel,code -United,123 -ABGUN,456 \ No newline at end of file diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 27ffec7760b7f..ef92a5d9c1280 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.csv import java.io.File import java.nio.charset.UnsupportedCharsetException +import java.nio.file.{Files, Paths} import java.sql.{Date, Timestamp} import java.text.SimpleDateFormat import java.util.Locale @@ -1280,21 +1281,53 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { ) } - test("skip the first byte of a char if it is disallowed in UTF-8") { - val df = spark.read - .format("csv") - .option("header", "true") - .load(testFile("test-data/utf8xFF.csv")) - val expectedSchema = new StructType() - .add("channel", StringType) - .add("code", StringType) - - assert(df.schema == expectedSchema) + def testHandlingUTF8Char(utf8char: Array[Byte]): Unit = { + val inHex = utf8char.map("%02x".format(_)).mkString("_") + test(s"SPARK-23649: handle the first byte of the char: $inHex") { + withTempPath { path => + val filename = s"${path.getAbsolutePath}.csv" + val header = "code,channel\n".getBytes + val row = "ABGUN".getBytes ++ utf8char ++ ",United".getBytes + val content = header ++ row + Files.write(Paths.get(filename), content) - val badStr = new String("ABGUN".getBytes :+ 0xff.toByte) - checkAnswer( - df.select("channel"), - Row("United") :: 
Row(badStr) :: Nil - ) + val df = spark.read + .format("csv") + .option("header", "true") + .load(filename) + val expectedSchema = new StructType() + .add("code", StringType) + .add("channel", StringType) + + assert(df.schema == expectedSchema) + + val badStr = new String("ABGUN".getBytes ++ utf8char) + checkAnswer( + df, + Row(badStr, "United") :: Nil + ) + } + } } + + Seq( + // Binary Hex Comments + // 0xxxxxxx 0x00..0x7F Only byte of a 1-byte character encoding + Array(0x00), Array(0x7F), + // 10xxxxxx 0x80..0xBF Continuation bytes (1-3 continuation bytes) + // The byte should be skipped because it cannot be the first byte + Array(0x80), Array(0xBF), + // 0xC0..0xC1 - disallowed in UTF-8 + // The byte should be skipped because it cannot be the first byte + Array(0xC0), Array(0xC1), + // 110xxxxx 0xC0..0xDF First byte of a 2-byte character encoding + Array(0xc2, 0x80), Array(0xdf, 0xbf), + // 1110xxxx 0xE0..0xEF First byte of a 3-byte character encoding + Array(0xe0, 0xa0, 0x80), Array(0xef, 0xbf, 0xbf), + // 11110xxx 0xF0..0xF4 First byte of a 4-byte character encoding + Array(0xf0, 0x9f, 0x9c, 0x80), Array(0xf4, 0x80, 0x83, 0xbf), + // 0xF5..0xFF - disallowed in UTF-8 + // The byte should be skipped because it cannot be the first byte + Array(0xF5), Array(0xFF) + ).map(a => a.map(_.toByte)).foreach(testHandlingUTF8Char(_)) } From c65f82782443c22cfcdc86dd01195fe8f623f2a4 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 16 Mar 2018 14:33:54 +0100 Subject: [PATCH 6/9] Using explicit charset in converting of a string to a byte array --- .../sql/execution/datasources/csv/CSVSuite.scala | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index ef92a5d9c1280..1f2443f77faa2 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.execution.datasources.csv import java.io.File -import java.nio.charset.UnsupportedCharsetException +import java.nio.charset.{StandardCharsets, UnsupportedCharsetException} import java.nio.file.{Files, Paths} import java.sql.{Date, Timestamp} import java.text.SimpleDateFormat @@ -1285,9 +1285,12 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { val inHex = utf8char.map("%02x".format(_)).mkString("_") test(s"SPARK-23649: handle the first byte of the char: $inHex") { withTempPath { path => + def getBytes(str: String): Array[Byte] = { + str.getBytes(StandardCharsets.UTF_8) + } val filename = s"${path.getAbsolutePath}.csv" - val header = "code,channel\n".getBytes - val row = "ABGUN".getBytes ++ utf8char ++ ",United".getBytes + val header = getBytes("code,channel\n") + val row = getBytes("ABGUN") ++ utf8char ++ getBytes(",United") val content = header ++ row Files.write(Paths.get(filename), content) @@ -1301,7 +1304,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { assert(df.schema == expectedSchema) - val badStr = new String("ABGUN".getBytes ++ utf8char) + val badStr = new String(getBytes("ABGUN") ++ utf8char) checkAnswer( df, Row(badStr, "United") :: Nil From 27e5a5b3023fc8dd65b65c41650ae22dd531e395 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 16 Mar 2018 15:53:54 +0100 Subject: [PATCH 7/9] Basic tests for 2-4 bytes chars and a test for skipping of wrong first byte of UTF-8 char --- .../spark/unsafe/types/UTF8StringSuite.java | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index 
9b303fa5bc6c5..d244908f4b338 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -58,8 +58,12 @@ private static void checkBasic(String str, int len) { @Test public void basicTest() { checkBasic("", 0); - checkBasic("hello", 5); + checkBasic("¡", 1); // 2 bytes char + checkBasic("ку", 2); // 2 * 2 bytes chars + checkBasic("hello", 5); // 5 * 1 byte chars checkBasic("大 千 世 界", 7); + checkBasic("︽﹋%", 3); // 3 * 3 bytes chars + checkBasic("\uD83E\uDD19", 1); // 4 bytes char } @Test @@ -791,4 +795,18 @@ public void trimRightWithTrimString() { assertEquals(fromString("头"), fromString("头a???/").trimRight(fromString("数?/*&^%a"))); assertEquals(fromString("头"), fromString("头数b数数 [").trimRight(fromString(" []数b"))); } + + @Test + public void skipWrongFirstByte() { + int[] wrongFirstBytes = { + 0x80, 0xBF, // Skip Continuation bytes + 0xC0, 0xC2, // 0xC0..0xC1 - disallowed in UTF-8 + 0xF5, 0xF9, 0xFF // 0xF5..0xFF - disallowed in UTF-8 + }; + for (int i = 0; i < wrongFirstBytes.length; ++i) { + byte[] c = new byte[1]; + c[0] = (byte)wrongFirstBytes[i]; + assertEquals(fromBytes(c).numChars(), 1); + } + } } From 8a501e359eab47f08c44dcdf3d88a0dec3f71470 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sat, 17 Mar 2018 10:46:13 +0100 Subject: [PATCH 8/9] Removing tests for CSV and adding additional cases for UTF8StringSuite --- .../spark/unsafe/types/UTF8StringSuite.java | 9 ++- .../execution/datasources/csv/CSVSuite.scala | 56 +------------------ 2 files changed, 7 insertions(+), 58 deletions(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index d244908f4b338..7c34d419574ef 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ 
b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -799,12 +799,15 @@ public void trimRightWithTrimString() { @Test public void skipWrongFirstByte() { int[] wrongFirstBytes = { - 0x80, 0xBF, // Skip Continuation bytes + 0x80, 0x9F, 0xBF, // Skip Continuation bytes 0xC0, 0xC2, // 0xC0..0xC1 - disallowed in UTF-8 - 0xF5, 0xF9, 0xFF // 0xF5..0xFF - disallowed in UTF-8 + // 0xF5..0xFF - disallowed in UTF-8 + 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, + 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF }; + byte[] c = new byte[1]; + for (int i = 0; i < wrongFirstBytes.length; ++i) { - byte[] c = new byte[1]; c[0] = (byte)wrongFirstBytes[i]; assertEquals(fromBytes(c).numChars(), 1); } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 1f2443f77faa2..4398e547d9217 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -18,8 +18,7 @@ package org.apache.spark.sql.execution.datasources.csv import java.io.File -import java.nio.charset.{StandardCharsets, UnsupportedCharsetException} -import java.nio.file.{Files, Paths} +import java.nio.charset.UnsupportedCharsetException import java.sql.{Date, Timestamp} import java.text.SimpleDateFormat import java.util.Locale @@ -1280,57 +1279,4 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { Row("0,2013-111-11 12:13:14") :: Row(null) :: Nil ) } - - def testHandlingUTF8Char(utf8char: Array[Byte]): Unit = { - val inHex = utf8char.map("%02x".format(_)).mkString("_") - test(s"SPARK-23649: handle the first byte of the char: $inHex") { - withTempPath { path => - def getBytes(str: String): Array[Byte] = { - str.getBytes(StandardCharsets.UTF_8) - } - val filename = s"${path.getAbsolutePath}.csv" - val header = getBytes("code,channel\n") - 
val row = getBytes("ABGUN") ++ utf8char ++ getBytes(",United") - val content = header ++ row - Files.write(Paths.get(filename), content) - - val df = spark.read - .format("csv") - .option("header", "true") - .load(filename) - val expectedSchema = new StructType() - .add("code", StringType) - .add("channel", StringType) - - assert(df.schema == expectedSchema) - - val badStr = new String(getBytes("ABGUN") ++ utf8char) - checkAnswer( - df, - Row(badStr, "United") :: Nil - ) - } - } - } - - Seq( - // Binary Hex Comments - // 0xxxxxxx 0x00..0x7F Only byte of a 1-byte character encoding - Array(0x00), Array(0x7F), - // 10xxxxxx 0x80..0xBF Continuation bytes (1-3 continuation bytes) - // The byte should be skipped because it cannot be the first byte - Array(0x80), Array(0xBF), - // 0xC0..0xC1 - disallowed in UTF-8 - // The byte should be skipped because it cannot be the first byte - Array(0xC0), Array(0xC1), - // 110xxxxx 0xC0..0xDF First byte of a 2-byte character encoding - Array(0xc2, 0x80), Array(0xdf, 0xbf), - // 1110xxxx 0xE0..0xEF First byte of a 3-byte character encoding - Array(0xe0, 0xa0, 0x80), Array(0xef, 0xbf, 0xbf), - // 11110xxx 0xF0..0xF4 First byte of a 4-byte character encoding - Array(0xf0, 0x9f, 0x9c, 0x80), Array(0xf4, 0x80, 0x83, 0xbf), - // 0xF5..0xFF - disallowed in UTF-8 - // The byte should be skipped because it cannot be the first byte - Array(0xF5), Array(0xFF) - ).map(a => a.map(_.toByte)).foreach(testHandlingUTF8Char(_)) } From 5557a80d4674e929332d9441342e5b90e314eb45 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Mon, 19 Mar 2018 21:45:44 +0100 Subject: [PATCH 9/9] Comment about disallowed first bytes according to UTF-8 standard --- .../java/org/apache/spark/unsafe/types/UTF8String.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index bbb3a66c576d3..5d468aed42337 
100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -59,7 +59,7 @@ public final class UTF8String implements Comparable, Externalizable, /** * A char in UTF-8 encoding can take 1-4 bytes depending on the first byte which - * indicates the size of the char. See Unicode standard in page 126: + * indicates the size of the char. See the Unicode Standard, page 126, Table 3-6: * http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf * * Binary Hex Comments @@ -67,7 +67,11 @@ public final class UTF8String implements Comparable, Externalizable, * 10xxxxxx 0x80..0xBF Continuation bytes (1-3 continuation bytes) * 110xxxxx 0xC0..0xDF First byte of a 2-byte character encoding * 1110xxxx 0xE0..0xEF First byte of a 3-byte character encoding - * 11110xxx 0xF0..0xF4 First byte of a 4-byte character encoding + * 11110xxx 0xF0..0xF7 First byte of a 4-byte character encoding + * + * As a consequence of the well-formedness conditions specified in + * Table 3-7 (page 126), the following byte values are disallowed in UTF-8: + * C0–C1, F5–FF. */ private static byte[] bytesOfCodePointInUTF8 = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..0x0F