Skip to content

Commit 07fd7d3

Browse files
committed
[SPARK-9460] Avoid byte array allocation in StringPrefixComparator.
As of today, StringPrefixComparator converts the long values back to byte arrays in order to compare them. This patch optimizes this to compare the longs directly, rather than turning the longs into byte arrays and comparing them byte by byte (unsigned). This only works on little-endian architecture right now. Author: Reynold Xin <[email protected]> Closes #7765 from rxin/SPARK-9460 and squashes the following commits: e4908cc [Reynold Xin] Stricter randomized tests. 4c8d094 [Reynold Xin] [SPARK-9460] Avoid byte array allocation in StringPrefixComparator.
1 parent 9514d87 commit 07fd7d3

File tree

4 files changed

+36
-32
lines changed

4 files changed

+36
-32
lines changed

core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java

Lines changed: 3 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,7 @@
1717

1818
package org.apache.spark.util.collection.unsafe.sort;
1919

20-
import com.google.common.base.Charsets;
21-
import com.google.common.primitives.Longs;
22-
import com.google.common.primitives.UnsignedBytes;
20+
import com.google.common.primitives.UnsignedLongs;
2321

2422
import org.apache.spark.annotation.Private;
2523
import org.apache.spark.unsafe.types.UTF8String;
@@ -36,32 +34,11 @@ private PrefixComparators() {}
3634
public static final class StringPrefixComparator extends PrefixComparator {
3735
@Override
3836
public int compare(long aPrefix, long bPrefix) {
39-
// TODO: can done more efficiently
40-
byte[] a = Longs.toByteArray(aPrefix);
41-
byte[] b = Longs.toByteArray(bPrefix);
42-
for (int i = 0; i < 8; i++) {
43-
int c = UnsignedBytes.compare(a[i], b[i]);
44-
if (c != 0) return c;
45-
}
46-
return 0;
47-
}
48-
49-
public long computePrefix(byte[] bytes) {
50-
if (bytes == null) {
51-
return 0L;
52-
} else {
53-
byte[] padded = new byte[8];
54-
System.arraycopy(bytes, 0, padded, 0, Math.min(bytes.length, 8));
55-
return Longs.fromByteArray(padded);
56-
}
57-
}
58-
59-
public long computePrefix(String value) {
60-
return value == null ? 0L : computePrefix(value.getBytes(Charsets.UTF_8));
37+
return UnsignedLongs.compare(aPrefix, bPrefix);
6138
}
6239

6340
public long computePrefix(UTF8String value) {
64-
return value == null ? 0L : computePrefix(value.getBytes());
41+
return value == null ? 0L : value.getPrefix();
6542
}
6643
}
6744

core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,22 +17,29 @@
1717

1818
package org.apache.spark.util.collection.unsafe.sort
1919

20+
import com.google.common.primitives.UnsignedBytes
2021
import org.scalatest.prop.PropertyChecks
21-
2222
import org.apache.spark.SparkFunSuite
23+
import org.apache.spark.unsafe.types.UTF8String
2324

2425
class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks {
2526

2627
test("String prefix comparator") {
2728

2829
def testPrefixComparison(s1: String, s2: String): Unit = {
29-
val s1Prefix = PrefixComparators.STRING.computePrefix(s1)
30-
val s2Prefix = PrefixComparators.STRING.computePrefix(s2)
30+
val utf8string1 = UTF8String.fromString(s1)
31+
val utf8string2 = UTF8String.fromString(s2)
32+
val s1Prefix = PrefixComparators.STRING.computePrefix(utf8string1)
33+
val s2Prefix = PrefixComparators.STRING.computePrefix(utf8string2)
3134
val prefixComparisonResult = PrefixComparators.STRING.compare(s1Prefix, s2Prefix)
35+
36+
val cmp = UnsignedBytes.lexicographicalComparator().compare(
37+
utf8string1.getBytes.take(8), utf8string2.getBytes.take(8))
38+
3239
assert(
33-
(prefixComparisonResult == 0) ||
34-
(prefixComparisonResult < 0 && s1 < s2) ||
35-
(prefixComparisonResult > 0 && s1 > s2))
40+
(prefixComparisonResult == 0 && cmp == 0) ||
41+
(prefixComparisonResult < 0 && s1.compareTo(s2) < 0) ||
42+
(prefixComparisonResult > 0 && s1.compareTo(s2) > 0))
3643
}
3744

3845
// scalastyle:off

unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,15 @@ public int numChars() {
137137
return len;
138138
}
139139

140+
/**
141+
* Returns a 64-bit integer that can be used as the prefix used in sorting.
142+
*/
143+
public long getPrefix() {
144+
long p = PlatformDependent.UNSAFE.getLong(base, offset);
145+
p = java.lang.Long.reverseBytes(p);
146+
return p;
147+
}
148+
140149
/**
141150
* Returns the underline bytes, will be a copy of it if it's part of another array.
142151
*/

unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,19 @@ public void emptyStringTest() {
6363
assertEquals(0, EMPTY_UTF8.numBytes());
6464
}
6565

66+
@Test
67+
public void prefix() {
68+
assertTrue(fromString("a").getPrefix() - fromString("b").getPrefix() < 0);
69+
assertTrue(fromString("ab").getPrefix() - fromString("b").getPrefix() < 0);
70+
assertTrue(
71+
fromString("abbbbbbbbbbbasdf").getPrefix() - fromString("bbbbbbbbbbbbasdf").getPrefix() < 0);
72+
assertTrue(fromString("").getPrefix() - fromString("a").getPrefix() < 0);
73+
assertTrue(fromString("你好").getPrefix() - fromString("世界").getPrefix() > 0);
74+
}
75+
6676
@Test
6777
public void compareTo() {
78+
assertTrue(fromString("").compareTo(fromString("a")) < 0);
6879
assertTrue(fromString("abc").compareTo(fromString("ABC")) > 0);
6980
assertTrue(fromString("abc0").compareTo(fromString("abc")) > 0);
7081
assertTrue(fromString("abcabcabc").compareTo(fromString("abcabcabc")) == 0);

0 commit comments

Comments
 (0)