diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index 89bfcef4d9466..90a3f91047b14 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -72,6 +72,7 @@ hk2-locator-2.4.0-b34.jar
hk2-utils-2.4.0-b34.jar
httpclient-4.5.2.jar
httpcore-4.4.4.jar
+icu4j-58.1.jar
ivy-2.4.0.jar
jackson-annotations-2.6.5.jar
jackson-core-2.6.5.jar
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index 8df3858825e13..52bf28a2f36da 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -74,6 +74,7 @@ hk2-locator-2.4.0-b34.jar
hk2-utils-2.4.0-b34.jar
httpclient-4.5.2.jar
httpcore-4.4.4.jar
+icu4j-58.1.jar
ivy-2.4.0.jar
jackson-annotations-2.6.5.jar
jackson-core-2.6.5.jar
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index 71e7fb6dd243d..6da8c491fd722 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -74,6 +74,7 @@ hk2-locator-2.4.0-b34.jar
hk2-utils-2.4.0-b34.jar
httpclient-4.5.2.jar
httpcore-4.4.4.jar
+icu4j-58.1.jar
ivy-2.4.0.jar
jackson-annotations-2.6.5.jar
jackson-core-2.6.5.jar
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index ba31391495f54..7039bdeb1b736 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -80,6 +80,7 @@ hk2-utils-2.4.0-b34.jar
htrace-core-3.0.4.jar
httpclient-4.5.2.jar
httpcore-4.4.4.jar
+icu4j-58.1.jar
ivy-2.4.0.jar
jackson-annotations-2.6.5.jar
jackson-core-2.6.5.jar
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index b129e5a99e2ff..afe4b2a5db884 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -80,6 +80,7 @@ hk2-utils-2.4.0-b34.jar
htrace-core-3.1.0-incubating.jar
httpclient-4.5.2.jar
httpcore-4.4.4.jar
+icu4j-58.1.jar
ivy-2.4.0.jar
jackson-annotations-2.6.5.jar
jackson-core-2.6.5.jar
diff --git a/pom.xml b/pom.xml
index c391102d37502..6f2b1cebf7f7a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -182,6 +182,7 @@
    <paranamer.version>2.8</paranamer.version>
    <maven-antrun.version>1.8</maven-antrun.version>
    <commons-crypto.version>1.0.0</commons-crypto.version>
+    <ibm.icu.version>58.1</ibm.icu.version>

    <test.java.home>${java.home}</test.java.home>
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index 7da77158ff07e..77b086824e6e8 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -91,6 +91,11 @@
      <artifactId>jackson-databind</artifactId>
      <version>${fasterxml.jackson.version}</version>
    </dependency>
+    <dependency>
+      <groupId>com.ibm.icu</groupId>
+      <artifactId>icu4j</artifactId>
+      <version>${ibm.icu.version}</version>
+    </dependency>
    <dependency>
      <groupId>org.scalacheck</groupId>
      <artifactId>scalacheck_${scala.binary.version}</artifactId>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index fcc02e5eb3ef9..5eb8d5d35d987 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -18,13 +18,15 @@
package org.apache.spark.sql
import java.io.CharArrayWriter
+import java.util.Locale
import scala.collection.JavaConverters._
import scala.language.implicitConversions
import scala.reflect.runtime.universe.TypeTag
import scala.util.control.NonFatal
-import org.apache.commons.lang3.StringUtils
+import com.ibm.icu.lang.UCharacter
+import com.ibm.icu.lang.UProperty
import org.apache.spark.annotation.{DeveloperApi, Experimental, InterfaceStability}
import org.apache.spark.api.java.JavaRDD
@@ -236,6 +238,29 @@ class Dataset[T] private[sql](
}
}
+ // ISO 639-1 codes of languages whose locales render ambiguous-width characters as wide
+ private val EAST_ASIAN_LANGS = Seq("ja", "vi", "ko", "zh")
+
+ private def unicodeWidth(str: String): Int = {
+ val locale = Locale.getDefault()
+ if (locale == null) {
+ throw new NullPointerException("locale is null")
+ }
+ val ambiguousLen = if (EAST_ASIAN_LANGS.contains(locale.getLanguage())) 2 else 1
+ var len = 0
+ var i = 0
+ // Advance by code point so that surrogate pairs are counted only once
+ while (i < str.length) {
+ val codePoint = str.codePointAt(i)
+ val value = UCharacter.getIntPropertyValue(codePoint, UProperty.EAST_ASIAN_WIDTH)
+ len = len + (value match {
+ case UCharacter.EastAsianWidth.NARROW | UCharacter.EastAsianWidth.NEUTRAL |
+ UCharacter.EastAsianWidth.HALFWIDTH => 1
+ case UCharacter.EastAsianWidth.FULLWIDTH | UCharacter.EastAsianWidth.WIDE => 2
+ case UCharacter.EastAsianWidth.AMBIGUOUS => ambiguousLen
+ case _ => 1
+ })
+ i = i + Character.charCount(codePoint)
+ }
+ len
+ }
+
/**
* Compose the string representing rows for output
*
@@ -275,36 +300,45 @@ class Dataset[T] private[sql](
val numCols = schema.fieldNames.length
// Initialise the width of each column to a minimum value of '3'
- val colWidths = Array.fill(numCols)(3)
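+ // colWidths(j)(i) holds the display width of cell i in row j; colMaxWidths(i) tracks the widest cell in column i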
+ val colMaxWidths = Array.fill(numCols)(3)
+ val colWidths = Array.ofDim[Int](rows.length, numCols)
// Compute the width of each column
+ var j = 0
for (row <- rows) {
for ((cell, i) <- row.zipWithIndex) {
- colWidths(i) = math.max(colWidths(i), cell.length)
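+ // Measure cells by East Asian display width rather than by character count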
+ val width = unicodeWidth(cell)
+ colWidths(j)(i) = width
+ colMaxWidths(i) = math.max(colMaxWidths(i), width)
}
+ j = j + 1
}
// Create SeparateLine
- val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
+ val sep: String = colMaxWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
// column names
rows.head.zipWithIndex.map { case (cell, i) =>
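+ // Row 0 of colWidths holds the header cells; pad by the gap to the column's maximum display width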
+ val paddingLen = colMaxWidths(i) - colWidths(0)(i)
if (truncate > 0) {
- StringUtils.leftPad(cell, colWidths(i))
+ new StringBuilder(cell.length, " " * paddingLen).append(cell)
} else {
- StringUtils.rightPad(cell, colWidths(i))
+ new StringBuilder(paddingLen, cell).append(" " * paddingLen)
}
}.addString(sb, "|", "|", "|\n")
sb.append(sep)
// data
- rows.tail.map {
- _.zipWithIndex.map { case (cell, i) =>
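+ // j indexes rows in colWidths; it is incremented before use because rows.tail starts at rows(1)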
+ j = 0
+ rows.tail.map { row =>
+ j = j + 1
+ row.zipWithIndex.map { case (cell, i) =>
+ val paddingLen = colMaxWidths(i) - colWidths(j)(i)
if (truncate > 0) {
- StringUtils.leftPad(cell.toString, colWidths(i))
+ new StringBuilder(cell.length, " " * paddingLen).append(cell)
} else {
- StringUtils.rightPad(cell.toString, colWidths(i))
+ new StringBuilder(paddingLen, cell).append(" " * paddingLen)
}
}.addString(sb, "|", "|", "|\n")
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index 1174d7354f931..7a6f535964e4e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -1060,6 +1060,41 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
}
assert(e.getMessage.contains("Cannot create encoder for Option of Product type"))
}
+
+ private def checkString(actual: String, expected: String): Unit = {
+ if (expected != actual) {
+ fail(
+ "Dataset.showString() gives wrong result:\n\n" + sideBySide(
+ "== Expected ==\n" + expected,
+ "== Actual ==\n" + actual
+ ).mkString("\n")
+ )
+ }
+ }
+
+ test("SPARK-18653: Dataset.show() should generate correct padding for Unicode Character") {
+ // scalastyle:off
+ val ds = Seq(UnicodeCaseClass(1, 1.1, "文字列1"), UnicodeCaseClass(-2, -2.2, "文字列")).toDS
+ val leftPadding = ds.showString(2, 99)
+ val rightPadding = ds.showString(2, -99)
+ checkString(leftPadding,
+ """+----+----+-------+
+ ||整数|実数|      s|
+ |+----+----+-------+
+ ||   1| 1.1|文字列1|
+ ||  -2|-2.2| 文字列|
+ |+----+----+-------+
+ |""".stripMargin)
+ checkString(rightPadding,
+ """+----+----+-------+
+ ||整数|実数|s      |
+ |+----+----+-------+
+ ||1   |1.1 |文字列1|
+ ||-2  |-2.2|文字列 |
+ |+----+----+-------+
+ |""".stripMargin)
+ // scalastyle:on
+ }
}
case class Generic[T](id: T, value: Double)
@@ -1135,3 +1170,6 @@ object DatasetTransform {
case class Route(src: String, dest: String, cost: Int)
case class GroupedRoutes(src: String, dest: String, routes: Seq[Route])
+// scalastyle:off
+case class UnicodeCaseClass(整数: Int, 実数: Double, s: String)
+// scalastyle:on