diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index 89bfcef4d9466..90a3f91047b14 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -72,6 +72,7 @@ hk2-locator-2.4.0-b34.jar
 hk2-utils-2.4.0-b34.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
+icu4j-58.1.jar
 ivy-2.4.0.jar
 jackson-annotations-2.6.5.jar
 jackson-core-2.6.5.jar
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index 8df3858825e13..52bf28a2f36da 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -74,6 +74,7 @@ hk2-locator-2.4.0-b34.jar
 hk2-utils-2.4.0-b34.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
+icu4j-58.1.jar
 ivy-2.4.0.jar
 jackson-annotations-2.6.5.jar
 jackson-core-2.6.5.jar
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index 71e7fb6dd243d..6da8c491fd722 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -74,6 +74,7 @@ hk2-locator-2.4.0-b34.jar
 hk2-utils-2.4.0-b34.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
+icu4j-58.1.jar
 ivy-2.4.0.jar
 jackson-annotations-2.6.5.jar
 jackson-core-2.6.5.jar
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index ba31391495f54..7039bdeb1b736 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -80,6 +80,7 @@ hk2-utils-2.4.0-b34.jar
 htrace-core-3.0.4.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
+icu4j-58.1.jar
 ivy-2.4.0.jar
 jackson-annotations-2.6.5.jar
 jackson-core-2.6.5.jar
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index b129e5a99e2ff..afe4b2a5db884 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -80,6 +80,7 @@ hk2-utils-2.4.0-b34.jar
 htrace-core-3.1.0-incubating.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
+icu4j-58.1.jar
 ivy-2.4.0.jar
 jackson-annotations-2.6.5.jar
 jackson-core-2.6.5.jar
diff --git a/pom.xml b/pom.xml
index c391102d37502..6f2b1cebf7f7a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -182,6 +182,7 @@
     2.8
     1.8
     1.0.0
+    <ibm.icu.version>58.1</ibm.icu.version>
     ${java.home}
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index 7da77158ff07e..77b086824e6e8 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -91,6 +91,11 @@
       <artifactId>jackson-databind</artifactId>
       <version>${fasterxml.jackson.version}</version>
     </dependency>
+    <dependency>
+      <groupId>com.ibm.icu</groupId>
+      <artifactId>icu4j</artifactId>
+      <version>${ibm.icu.version}</version>
+    </dependency>
     <dependency>
       <groupId>org.scalacheck</groupId>
       <artifactId>scalacheck_${scala.binary.version}</artifactId>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index fcc02e5eb3ef9..5eb8d5d35d987 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -18,13 +18,15 @@
 package org.apache.spark.sql
 
 import java.io.CharArrayWriter
+import java.util.Locale
 
 import scala.collection.JavaConverters._
 import scala.language.implicitConversions
 import scala.reflect.runtime.universe.TypeTag
 import scala.util.control.NonFatal
 
-import org.apache.commons.lang3.StringUtils
+import com.ibm.icu.lang.UCharacter
+import com.ibm.icu.lang.UProperty
 
 import org.apache.spark.annotation.{DeveloperApi, Experimental, InterfaceStability}
 import org.apache.spark.api.java.JavaRDD
@@ -236,6 +238,29 @@ class Dataset[T] private[sql](
     }
   }
 
+  val EAST_ASIAN_LANGS = Seq("ja", "vi", "ko", "zh")
+
+  private def unicodeWidth(str: String): Int = {
+    val locale = Locale.getDefault()
+    if (locale == null) {
+      throw new NullPointerException("locale is null")
+    }
+    val ambiguousLen = if (EAST_ASIAN_LANGS.contains(locale.getLanguage())) 2 else 1
+    var len = 0
+    for (i <- 0 until str.length) {
+      val codePoint = str.codePointAt(i)
+      val value = UCharacter.getIntPropertyValue(codePoint, UProperty.EAST_ASIAN_WIDTH)
+      len = len + (value match {
+        case UCharacter.EastAsianWidth.NARROW | UCharacter.EastAsianWidth.NEUTRAL |
+             UCharacter.EastAsianWidth.HALFWIDTH => 1
+        case UCharacter.EastAsianWidth.FULLWIDTH | UCharacter.EastAsianWidth.WIDE => 2
+        case UCharacter.EastAsianWidth.AMBIGUOUS => ambiguousLen
+        case _ => 1
+      })
+    }
+    len
+  }
+
   /**
    * Compose the string representing rows for output
    *
@@ -275,36 +300,45 @@
     val numCols = schema.fieldNames.length
 
     // Initialise the width of each column to a minimum value of '3'
-    val colWidths = Array.fill(numCols)(3)
+    val colMaxWidths = Array.fill(numCols)(3)
+    val colWidths = Array.ofDim[Int](rows.length, numCols)
 
     // Compute the width of each column
+    var j = 0
     for (row <- rows) {
       for ((cell, i) <- row.zipWithIndex) {
-        colWidths(i) = math.max(colWidths(i), cell.length)
+        val width = unicodeWidth(cell)
+        colWidths(j)(i) = width
+        colMaxWidths(i) = math.max(colMaxWidths(i), width)
       }
+      j = j + 1
     }
 
     // Create SeparateLine
-    val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
+    val sep: String = colMaxWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
 
     // column names
     rows.head.zipWithIndex.map { case (cell, i) =>
+      val paddingLen = colMaxWidths(i) - colWidths(0)(i)
       if (truncate > 0) {
-        StringUtils.leftPad(cell, colWidths(i))
+        new StringBuilder(cell.length, " " * paddingLen).append(cell)
      } else {
-        StringUtils.rightPad(cell, colWidths(i))
+        new StringBuilder(paddingLen, cell).append(" " * paddingLen)
       }
     }.addString(sb, "|", "|", "|\n")
 
     sb.append(sep)
 
     // data
-    rows.tail.map {
-      _.zipWithIndex.map { case (cell, i) =>
+    j = 0
+    rows.tail.map { row =>
+      j = j + 1
+      row.zipWithIndex.map { case (cell, i) =>
+        val paddingLen = colMaxWidths(i) - colWidths(j)(i)
         if (truncate > 0) {
-          StringUtils.leftPad(cell.toString, colWidths(i))
+          new StringBuilder(cell.length, " " * paddingLen).append(cell)
         } else {
-          StringUtils.rightPad(cell.toString, colWidths(i))
+          new StringBuilder(paddingLen, cell).append(" " * paddingLen)
         }
       }.addString(sb, "|", "|", "|\n")
     }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index 1174d7354f931..7a6f535964e4e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -1060,6 +1060,41 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
     }
     assert(e.getMessage.contains("Cannot create encoder for Option of Product type"))
   }
+
+  private def checkString[T](actual: String, expected: String): Unit = {
+    if (expected != actual) {
+      fail(
+        "Dataset.showString() gives wrong result:\n\n" + sideBySide(
+          "== Expected ==\n" + expected,
+          "== Actual ==\n" + actual
+        ).mkString("\n")
+      )
+    }
+  }
+
+  test("SPARK-18653: Dataset.show() should generate correct padding for Unicode Character") {
+    // scalastyle:off
+    val ds = Seq(UnicodeCaseClass(1, 1.1, "文字列1"), UnicodeCaseClass(-2, -2.2, "文字列")).toDS
+    val leftPadding = ds.showString(2, 99)
+    val rightPadding = ds.showString(2, -99)
+    checkString(leftPadding,
+      """+----+----+-------+
+        ||整数|実数|      s|
+        |+----+----+-------+
+        ||   1| 1.1|文字列1|
+        ||  -2|-2.2| 文字列|
+        |+----+----+-------+
+        |""".stripMargin)
+    checkString(rightPadding,
+      """+----+----+-------+
+        ||整数|実数|s      |
+        |+----+----+-------+
+        ||1   |1.1 |文字列1|
+        ||-2  |-2.2|文字列 |
+        |+----+----+-------+
+        |""".stripMargin)
+    // scalastyle:on
+  }
 }
 
 case class Generic[T](id: T, value: Double)
@@ -1135,3 +1170,6 @@ object DatasetTransform {
 
 case class Route(src: String, dest: String, cost: Int)
 case class GroupedRoutes(src: String, dest: String, routes: Seq[Route])
+// scalastyle:off
+case class UnicodeCaseClass(整数: Int, 実数: Double, s: String)
+// scalastyle:on
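
Note on the width computation this patch adds: unicodeWidth classifies each code point by ICU4J's East Asian Width property (UAX #11) and counts Wide/Fullwidth characters as two display columns, which is what keeps the "|" and "+" borders aligned when cells contain CJK text. Below is a minimal standalone sketch of the same idea, assuming only that icu4j is on the classpath; the object and method names (EastAsianWidthDemo, codePointWidth, displayWidth) are illustrative and not part of the patch, and unlike the patch it walks the string by code point rather than by char index.

import com.ibm.icu.lang.{UCharacter, UProperty}

object EastAsianWidthDemo {
  // Display width of one code point under Unicode East Asian Width (UAX #11).
  // Wide and Fullwidth characters occupy two terminal columns; Ambiguous ones
  // are typically rendered wide only in East Asian locales, so the caller
  // chooses their width here.
  def codePointWidth(codePoint: Int, ambiguousWidth: Int): Int =
    UCharacter.getIntPropertyValue(codePoint, UProperty.EAST_ASIAN_WIDTH) match {
      case UCharacter.EastAsianWidth.FULLWIDTH | UCharacter.EastAsianWidth.WIDE => 2
      case UCharacter.EastAsianWidth.AMBIGUOUS => ambiguousWidth
      case _ => 1 // NARROW, HALFWIDTH, NEUTRAL
    }

  // Sum widths over code points (not chars), so surrogate pairs count once.
  def displayWidth(s: String, ambiguousWidth: Int = 1): Int = {
    var width = 0
    var i = 0
    while (i < s.length) {
      val cp = s.codePointAt(i)
      width += codePointWidth(cp, ambiguousWidth)
      i += Character.charCount(cp)
    }
    width
  }

  def main(args: Array[String]): Unit = {
    // "文字列1" = three wide CJK characters (2 columns each) + one narrow digit = 7,
    // matching the 7-column third column in the test's expected output.
    println(displayWidth("文字列1")) // 7
    println(displayWidth("abc"))     // 3
  }
}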