From f3082d35332615c2790af6a63b0e93be4ef281ce Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 24 May 2019 22:47:54 -0700 Subject: [PATCH 1/7] Optimize UTF8String.replace() / StringReplace expression. --- common/unsafe/pom.xml | 9 ++-- .../apache/spark/unsafe/types/UTF8String.java | 32 +++++++++++-- .../expressions/stringExpressions.scala | 48 +++++++++++++++++-- .../expressions/StringExpressionsSuite.scala | 16 +++---- 4 files changed, 86 insertions(+), 19 deletions(-) diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 93a4f67fd23f2..024a907f70809 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -65,6 +65,10 @@ com.google.guava guava + + org.apache.commons + commons-lang3 + @@ -84,11 +88,6 @@ scalacheck_${scala.binary.version} test - - org.apache.commons - commons-lang3 - test - org.apache.commons commons-text diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 3a3bfc4a94bb3..1462373fca352 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -25,6 +25,7 @@ import java.util.Arrays; import java.util.Map; +import org.apache.commons.lang3.StringUtils; import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.KryoSerializable; import com.esotericsoftware.kryo.io.Input; @@ -976,9 +977,34 @@ public UTF8String replace(UTF8String search, UTF8String replace) { if (EMPTY_UTF8.equals(search)) { return this; } - String replaced = toString().replace( - search.toString(), replace.toString()); - return fromString(replaced); + return replace(search.toString(), replace.toString()); + } + + public UTF8String replace(String search, String replace) { + // In Java 8, String.replace() is implemented using a regex and is therefore + // somewhat inefficient (see https://bugs.openjdk.java.net/browse/JDK-8058779). + // This is fixed in Java 9, but in Java 8 we can use Commons StringUtils instead: + String before = toString(); + String after = StringUtils.replace(before, search, replace); + // Use reference equality to cheaply detect whether the replacement had no effect, + // in which case we can simply return the original UTF8String and save some copying. + if (before == after) { + return this; + } else { + return fromString(after); + } + } + + public UTF8String replace(char search, char replace) { + String before = toString(); + String after = before.replace(search, replace); + // Use reference equality to cheaply detect whether the replacement had no effect, + // in which case we can simply return the original UTF8String and save some copying. + if (before == after) { + return this; + } else { + return fromString(after); + } } // TODO: Need to use `Code Point` here instead of Char in case the character longer than 2 bytes diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 6145ee676047d..d38c4bbfbc2a1 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -25,6 +25,7 @@ import java.util.regex.Pattern import scala.collection.mutable.ArrayBuffer import org.apache.commons.codec.binary.{Base64 => CommonsBase64} +import org.apache.commons.lang3.StringEscapeUtils import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult @@ -443,9 +444,50 @@ case class StringReplace(srcExpr: Expression, searchExpr: Expression, replaceExp } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - nullSafeCodeGen(ctx, ev, (src, search, replace) => { - s"""${ev.value} = $src.replace($search, $replace);""" - }) + if (searchExpr.foldable && replaceExpr.foldable) { + // The search and replacement strings are constants, so we can use a more optimized path + // which avoids repeatedly decoding those UTF8Strings. + val search = searchExpr.eval() + val replace = replaceExpr.eval() + if (search == null || replace == null) { + // Either the search or replacement is null, so the entire expression evaluates to null + ev.copy(code = code""" + boolean ${ev.isNull} = true; + ${CodeGenerator.javaType(dataType)} ${ev.value} = ${CodeGenerator.defaultValue(dataType)}; + """) + } else { + val searchStr: String = search.asInstanceOf[UTF8String].toString + val replaceStr: String = replace.asInstanceOf[UTF8String].toString + val escapedSearchStr = StringEscapeUtils.escapeJava(searchStr) + val escapedReplaceStr = StringEscapeUtils.escapeJava(replaceStr) + val q: String = { + if (searchStr.length == 1 && replaceStr.length == 1) { + // Both strings are single characters, so embed them as character literals + // in order to use a single-character-replacement fast path. + "'" + } else { + // Use slower string literal replacement path (double quotes for string literal) + "\"" + } + } + // We don't use nullSafeCodeGen here because we don't want to re-evaluate the search + // and replace expressions. + val eval = srcExpr.genCode(ctx) + ev.copy(code = code""" + ${eval.code} + boolean ${ev.isNull} = ${eval.isNull}; + ${CodeGenerator.javaType(dataType)} ${ev.value} = ${CodeGenerator.defaultValue(dataType)}; + if (!${ev.isNull}) { + ${ev.value} = ${eval.value}.replace($q$escapedSearchStr$q, $q$escapedReplaceStr$q); + } + """) + } + } else { + // Either the search or replace expression is non-foldable, so use a slower path: + nullSafeCodeGen(ctx, ev, (src, search, replace) => { + s"""${ev.value} = $src.replace($search, $replace);""" + }) + } } override def dataType: DataType = StringType diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 1e7737b6a88ef..15c7b7e24fd24 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -409,19 +409,19 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("replace") { - checkEvaluation( - StringReplace(Literal("replace"), Literal("pl"), Literal("123")), "re123ace") + val nullString = Literal.create(null, StringType) + checkEvaluation(StringReplace(Literal("replace"), Literal("pl"), Literal("123")), "re123ace") checkEvaluation(StringReplace(Literal("replace"), Literal("pl"), Literal("")), "reace") checkEvaluation(StringReplace(Literal("replace"), Literal(""), Literal("123")), "replace") - checkEvaluation(StringReplace(Literal.create(null, StringType), - Literal("pl"), Literal("123")), null) - checkEvaluation(StringReplace(Literal("replace"), - Literal.create(null, StringType), Literal("123")), null) - checkEvaluation(StringReplace(Literal("replace"), - Literal("pl"), Literal.create(null, StringType)), null) + checkEvaluation(StringReplace(nullString, Literal("pl"), Literal("123")), null) + checkEvaluation(StringReplace(Literal("replace"), nullString, Literal("123")), null) + checkEvaluation(StringReplace(Literal("replace"), Literal("pl"), nullString), null) // test for multiple replace checkEvaluation(StringReplace(Literal("abcabc"), Literal("b"), Literal("12")), "a12ca12c") checkEvaluation(StringReplace(Literal("abcdabcd"), Literal("bc"), Literal("")), "adad") + // tests for single character search and replacement strings + checkEvaluation(StringReplace(Literal("abcabc"), Literal("a"), Literal("A")), "AbcAbc") + checkEvaluation(StringReplace(Literal("abcabc"), Literal("Z"), Literal("A")), "abcabc") // scalastyle:off // non ascii characters are not allowed in the source code, so we disable the scalastyle. checkEvaluation(StringReplace(Literal("花花世界"), Literal("花世"), Literal("ab")), "花ab界") From b06d9174de5508aae8320ffe30abf25ad104b11c Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 24 May 2019 23:19:28 -0700 Subject: [PATCH 2/7] Can't interpolate as literals due to Scala string escaping issue. --- .../apache/spark/unsafe/types/UTF8String.java | 22 +++++++------------ .../expressions/stringExpressions.scala | 17 +++----------- .../expressions/StringExpressionsSuite.scala | 1 + 3 files changed, 12 insertions(+), 28 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 1462373fca352..c4d62d8ad1d29 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -981,23 +981,17 @@ public UTF8String replace(UTF8String search, UTF8String replace) { } public UTF8String replace(String search, String replace) { - // In Java 8, String.replace() is implemented using a regex and is therefore - // somewhat inefficient (see https://bugs.openjdk.java.net/browse/JDK-8058779). - // This is fixed in Java 9, but in Java 8 we can use Commons StringUtils instead: String before = toString(); - String after = StringUtils.replace(before, search, replace); - // Use reference equality to cheaply detect whether the replacement had no effect, - // in which case we can simply return the original UTF8String and save some copying. - if (before == after) { - return this; + String after; + if (search.length() == 1 && replace.length() == 1) { + // Use single-character-replacement fast path + after = before.replace(search.charAt(0), replace.charAt(0)); } else { - return fromString(after); + // In Java 8, String.replace() is implemented using a regex and is therefore + // somewhat inefficient (see https://bugs.openjdk.java.net/browse/JDK-8058779). + // This is fixed in Java 9, but in Java 8 we can use Commons StringUtils instead: + after = StringUtils.replace(before, search, replace); } - } - - public UTF8String replace(char search, char replace) { - String before = toString(); - String after = before.replace(search, replace); // Use reference equality to cheaply detect whether the replacement had no effect, // in which case we can simply return the original UTF8String and save some copying. if (before == after) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index d38c4bbfbc2a1..442a8e09c4aaa 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -25,7 +25,6 @@ import java.util.regex.Pattern import scala.collection.mutable.ArrayBuffer import org.apache.commons.codec.binary.{Base64 => CommonsBase64} -import org.apache.commons.lang3.StringEscapeUtils import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult @@ -458,18 +457,8 @@ case class StringReplace(srcExpr: Expression, searchExpr: Expression, replaceExp } else { val searchStr: String = search.asInstanceOf[UTF8String].toString val replaceStr: String = replace.asInstanceOf[UTF8String].toString - val escapedSearchStr = StringEscapeUtils.escapeJava(searchStr) - val escapedReplaceStr = StringEscapeUtils.escapeJava(replaceStr) - val q: String = { - if (searchStr.length == 1 && replaceStr.length == 1) { - // Both strings are single characters, so embed them as character literals - // in order to use a single-character-replacement fast path. - "'" - } else { - // Use slower string literal replacement path (double quotes for string literal) - "\"" - } - } + val searchStrRef = ctx.addReferenceObj("searchStr", searchStr, "String") + val replaceStrRef = ctx.addReferenceObj("searchStr", replaceStr, "String") // We don't use nullSafeCodeGen here because we don't want to re-evaluate the search // and replace expressions. val eval = srcExpr.genCode(ctx) @@ -478,7 +467,7 @@ case class StringReplace(srcExpr: Expression, searchExpr: Expression, replaceExp boolean ${ev.isNull} = ${eval.isNull}; ${CodeGenerator.javaType(dataType)} ${ev.value} = ${CodeGenerator.defaultValue(dataType)}; if (!${ev.isNull}) { - ${ev.value} = ${eval.value}.replace($q$escapedSearchStr$q, $q$escapedReplaceStr$q); + ${ev.value} = ${eval.value}.replace($searchStrRef, $replaceStrRef); } """) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 15c7b7e24fd24..68cf6ed892b5d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -425,6 +425,7 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // scalastyle:off // non ascii characters are not allowed in the source code, so we disable the scalastyle. checkEvaluation(StringReplace(Literal("花花世界"), Literal("花世"), Literal("ab")), "花ab界") + checkEvaluation(StringReplace(Literal("火"), Literal("火"), Literal("水")), "水") // scalastyle:on } From b51035b5175884ffc6b06dc3307f7d70d224525c Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sun, 26 May 2019 13:09:57 -0700 Subject: [PATCH 3/7] Correct referenceObj variable naming typo --- .../spark/sql/catalyst/expressions/stringExpressions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 442a8e09c4aaa..b5448753df943 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -458,7 +458,7 @@ case class StringReplace(srcExpr: Expression, searchExpr: Expression, replaceExp val searchStr: String = search.asInstanceOf[UTF8String].toString val replaceStr: String = replace.asInstanceOf[UTF8String].toString val searchStrRef = ctx.addReferenceObj("searchStr", searchStr, "String") - val replaceStrRef = ctx.addReferenceObj("searchStr", replaceStr, "String") + val replaceStrRef = ctx.addReferenceObj("replaceStr", replaceStr, "String") // We don't use nullSafeCodeGen here because we don't want to re-evaluate the search // and replace expressions. val eval = srcExpr.genCode(ctx) From 6fd7714ee0ffb1732faf70783751f04b8de001f1 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 27 May 2019 12:03:23 -0700 Subject: [PATCH 4/7] Implement direct replace() over UTF8String bytes. --- .../spark/unsafe}/UTF8StringBuilder.java | 27 +++++++++++-- .../apache/spark/unsafe/types/UTF8String.java | 25 +++++++++++- .../spark/unsafe/types/UTF8StringSuite.java | 38 +++++++++++++++++++ .../spark/sql/catalyst/expressions/Cast.scala | 1 + .../expressions/collectionOperations.scala | 1 + 5 files changed, 87 insertions(+), 5 deletions(-) rename {sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen => common/unsafe/src/main/java/org/apache/spark/unsafe}/UTF8StringBuilder.java (80%) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/UTF8StringBuilder.java similarity index 80% rename from sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java rename to common/unsafe/src/main/java/org/apache/spark/unsafe/UTF8StringBuilder.java index f0f66bae245fd..481ea89090b2a 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/UTF8StringBuilder.java @@ -15,9 +15,8 @@ * limitations under the License. */ -package org.apache.spark.sql.catalyst.expressions.codegen; +package org.apache.spark.unsafe; -import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.types.UTF8String; @@ -34,7 +33,18 @@ public class UTF8StringBuilder { public UTF8StringBuilder() { // Since initial buffer size is 16 in `StringBuilder`, we set the same size here - this.buffer = new byte[16]; + this(16); + } + + public UTF8StringBuilder(int initialSize) { + if (initialSize < 0) { + throw new IllegalArgumentException("Size must be non-negative"); + } + if (initialSize > ARRAY_MAX) { + throw new IllegalArgumentException( + "Size " + initialSize + " exceeded maximum size of " + ARRAY_MAX); + } + this.buffer = new byte[initialSize]; } // Grows the buffer by at least `neededSize` @@ -72,6 +82,17 @@ public void append(String value) { append(UTF8String.fromString(value)); } + public void appendBytes(Object base, long offset, int length) { + grow(length); + Platform.copyMemory( + base, + offset, + buffer, + cursor, + length); + cursor += length; + } + public UTF8String build() { return UTF8String.fromBytes(buffer, 0, totalSize()); } diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index c4d62d8ad1d29..78051df9a5226 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -33,6 +33,7 @@ import com.google.common.primitives.Ints; import org.apache.spark.unsafe.Platform; +import org.apache.spark.unsafe.UTF8StringBuilder; import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.hash.Murmur3_x86_32; @@ -974,10 +975,30 @@ public UTF8String[] split(UTF8String pattern, int limit) { } public UTF8String replace(UTF8String search, UTF8String replace) { - if (EMPTY_UTF8.equals(search)) { + // This implementation loosely based on commons-lang3's StringUtils.replace(). + if (numBytes == 0 || search.numBytes == 0) { return this; } - return replace(search.toString(), replace.toString()); + // Find the first occurrence of the search string. + int start = 0; + int end = this.find(search, start); + if (end == -1) { + // Search string was not found, so string is unchanged. + return this; + } + // At least one match was found. Estimate space needed for result. + int increase = replace.numBytes - search.numBytes; + increase = increase < 0 ? 0 : increase; + increase *= 16; + final UTF8StringBuilder buf = new UTF8StringBuilder(numBytes + increase); + while (end != -1) { + buf.appendBytes(this.base, this.offset + start, end - start); + buf.append(replace); + start = end + search.numBytes; + end = this.find(search, start); + } + buf.appendBytes(this.base, this.offset + start, numBytes - start); + return buf.build(); } public UTF8String replace(String search, String replace) { diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index cf9cc6b1800a9..bc75fa9e724a0 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -403,6 +403,44 @@ public void split() { new UTF8String[]{fromString("ab"), fromString("def,ghi,")})); } + @Test + public void replace() { + assertEquals( + fromString("re123ace"), + fromString("replace").replace(fromString("pl"), fromString("123"))); + assertEquals( + fromString("reace"), + fromString("replace").replace(fromString("pl"), fromString(""))); + assertEquals( + fromString("replace"), + fromString("replace").replace(fromString(""), fromString("123"))); + // tests for multiple replacements + assertEquals( + fromString("a12ca12c"), + fromString("abcabc").replace(fromString("b"), fromString("12"))); + assertEquals( + fromString("adad"), + fromString("abcdabcd").replace(fromString("bc"), fromString(""))); + // tests for single character search and replacement strings + assertEquals( + fromString("AbcAbc"), + fromString("abcabc").replace(fromString("a"), fromString("A"))); + assertEquals( + fromString("abcabc"), + fromString("abcabc").replace(fromString("Z"), fromString("A"))); + // Tests with non-ASCII characters + assertEquals( + fromString("花ab界"), + fromString("花花世界").replace(fromString("花世"), fromString("ab"))); + assertEquals( + fromString("a水c"), + fromString("a火c").replace(fromString("火"), fromString("水"))); + // Tests for a large number of replacements, triggering UTF8StringBuilder resize + assertEquals( + fromString("abcd").repeat(17), + fromString("a").repeat(17).replace(fromString("a"), fromString("abcd"))); + } + @Test public void levenshteinDistance() { assertEquals(0, EMPTY_UTF8.levenshteinDistance(EMPTY_UTF8)); diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index f8c1102953ab3..969128838eba4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.catalyst.util.DateTimeUtils._ import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.UTF8StringBuilder import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} import org.apache.spark.unsafe.types.UTF8String.{IntWrapper, LongWrapper} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 41d9b06ed1d01..8477e63135e30 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.catalyst.util.DateTimeUtils._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.UTF8StringBuilder import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.array.ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH import org.apache.spark.unsafe.types.{ByteArray, UTF8String} From ec423f1cd1eaf99fad247dafcabc4adc7a7fa019 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 27 May 2019 12:04:02 -0700 Subject: [PATCH 5/7] Roll back codegen changes. --- common/unsafe/pom.xml | 9 +++-- .../apache/spark/unsafe/types/UTF8String.java | 22 ----------- .../expressions/stringExpressions.scala | 37 ++----------------- .../expressions/StringExpressionsSuite.scala | 17 ++++----- 4 files changed, 16 insertions(+), 69 deletions(-) diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 024a907f70809..93a4f67fd23f2 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -65,10 +65,6 @@ com.google.guava guava - - org.apache.commons - commons-lang3 - @@ -88,6 +84,11 @@ scalacheck_${scala.binary.version} test + + org.apache.commons + commons-lang3 + test + org.apache.commons commons-text diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 78051df9a5226..1fa8200b4a3f4 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -25,7 +25,6 @@ import java.util.Arrays; import java.util.Map; -import org.apache.commons.lang3.StringUtils; import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.KryoSerializable; import com.esotericsoftware.kryo.io.Input; @@ -1001,27 +1000,6 @@ public UTF8String replace(UTF8String search, UTF8String replace) { return buf.build(); } - public UTF8String replace(String search, String replace) { - String before = toString(); - String after; - if (search.length() == 1 && replace.length() == 1) { - // Use single-character-replacement fast path - after = before.replace(search.charAt(0), replace.charAt(0)); - } else { - // In Java 8, String.replace() is implemented using a regex and is therefore - // somewhat inefficient (see https://bugs.openjdk.java.net/browse/JDK-8058779). - // This is fixed in Java 9, but in Java 8 we can use Commons StringUtils instead: - after = StringUtils.replace(before, search, replace); - } - // Use reference equality to cheaply detect whether the replacement had no effect, - // in which case we can simply return the original UTF8String and save some copying. - if (before == after) { - return this; - } else { - return fromString(after); - } - } - // TODO: Need to use `Code Point` here instead of Char in case the character longer than 2 bytes public UTF8String translate(Map dict) { String srcStr = this.toString(); diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index b5448753df943..6145ee676047d 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -443,40 +443,9 @@ case class StringReplace(srcExpr: Expression, searchExpr: Expression, replaceExp } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - if (searchExpr.foldable && replaceExpr.foldable) { - // The search and replacement strings are constants, so we can use a more optimized path - // which avoids repeatedly decoding those UTF8Strings. - val search = searchExpr.eval() - val replace = replaceExpr.eval() - if (search == null || replace == null) { - // Either the search or replacement is null, so the entire expression evaluates to null - ev.copy(code = code""" - boolean ${ev.isNull} = true; - ${CodeGenerator.javaType(dataType)} ${ev.value} = ${CodeGenerator.defaultValue(dataType)}; - """) - } else { - val searchStr: String = search.asInstanceOf[UTF8String].toString - val replaceStr: String = replace.asInstanceOf[UTF8String].toString - val searchStrRef = ctx.addReferenceObj("searchStr", searchStr, "String") - val replaceStrRef = ctx.addReferenceObj("replaceStr", replaceStr, "String") - // We don't use nullSafeCodeGen here because we don't want to re-evaluate the search - // and replace expressions. - val eval = srcExpr.genCode(ctx) - ev.copy(code = code""" - ${eval.code} - boolean ${ev.isNull} = ${eval.isNull}; - ${CodeGenerator.javaType(dataType)} ${ev.value} = ${CodeGenerator.defaultValue(dataType)}; - if (!${ev.isNull}) { - ${ev.value} = ${eval.value}.replace($searchStrRef, $replaceStrRef); - } - """) - } - } else { - // Either the search or replace expression is non-foldable, so use a slower path: - nullSafeCodeGen(ctx, ev, (src, search, replace) => { - s"""${ev.value} = $src.replace($search, $replace);""" - }) - } + nullSafeCodeGen(ctx, ev, (src, search, replace) => { + s"""${ev.value} = $src.replace($search, $replace);""" + }) } override def dataType: DataType = StringType diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 68cf6ed892b5d..1e7737b6a88ef 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -409,23 +409,22 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("replace") { - val nullString = Literal.create(null, StringType) - checkEvaluation(StringReplace(Literal("replace"), Literal("pl"), Literal("123")), "re123ace") + checkEvaluation( + StringReplace(Literal("replace"), Literal("pl"), Literal("123")), "re123ace") checkEvaluation(StringReplace(Literal("replace"), Literal("pl"), Literal("")), "reace") checkEvaluation(StringReplace(Literal("replace"), Literal(""), Literal("123")), "replace") - checkEvaluation(StringReplace(nullString, Literal("pl"), Literal("123")), null) - checkEvaluation(StringReplace(Literal("replace"), nullString, Literal("123")), null) - checkEvaluation(StringReplace(Literal("replace"), Literal("pl"), nullString), null) + checkEvaluation(StringReplace(Literal.create(null, StringType), + Literal("pl"), Literal("123")), null) + checkEvaluation(StringReplace(Literal("replace"), + Literal.create(null, StringType), Literal("123")), null) + checkEvaluation(StringReplace(Literal("replace"), + Literal("pl"), Literal.create(null, StringType)), null) // test for multiple replace checkEvaluation(StringReplace(Literal("abcabc"), Literal("b"), Literal("12")), "a12ca12c") checkEvaluation(StringReplace(Literal("abcdabcd"), Literal("bc"), Literal("")), "adad") - // tests for single character search and replacement strings - checkEvaluation(StringReplace(Literal("abcabc"), Literal("a"), Literal("A")), "AbcAbc") - checkEvaluation(StringReplace(Literal("abcabc"), Literal("Z"), Literal("A")), "abcabc") // scalastyle:off // non ascii characters are not allowed in the source code, so we disable the scalastyle. checkEvaluation(StringReplace(Literal("花花世界"), Literal("花世"), Literal("ab")), "花ab界") - checkEvaluation(StringReplace(Literal("火"), Literal("火"), Literal("水")), "水") // scalastyle:on } From 8123e42420435c1b234825adf02c19fae16aa4a2 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sun, 9 Jun 2019 15:36:57 -0700 Subject: [PATCH 6/7] Fix comment typo --- .../src/main/java/org/apache/spark/unsafe/types/UTF8String.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 1fa8200b4a3f4..33580056dada7 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -974,7 +974,7 @@ public UTF8String[] split(UTF8String pattern, int limit) { } public UTF8String replace(UTF8String search, UTF8String replace) { - // This implementation loosely based on commons-lang3's StringUtils.replace(). + // This implementation is loosely based on commons-lang3's StringUtils.replace(). if (numBytes == 0 || search.numBytes == 0) { return this; } From 6188dcdd321967eb78535d7cf55b73f20047f449 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sun, 9 Jun 2019 15:39:37 -0700 Subject: [PATCH 7/7] Remove ternary operator --- .../main/java/org/apache/spark/unsafe/types/UTF8String.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 33580056dada7..00c98c91a6d7f 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -986,9 +986,8 @@ public UTF8String replace(UTF8String search, UTF8String replace) { return this; } // At least one match was found. Estimate space needed for result. - int increase = replace.numBytes - search.numBytes; - increase = increase < 0 ? 0 : increase; - increase *= 16; + // The 16x multiplier here is chosen to match commons-lang3's implementation. + int increase = Math.max(0, replace.numBytes - search.numBytes) * 16; final UTF8StringBuilder buf = new UTF8StringBuilder(numBytes + increase); while (end != -1) { buf.appendBytes(this.base, this.offset + start, end - start);