Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,7 @@

package org.apache.spark.util.collection.unsafe.sort;

import com.google.common.base.Charsets;
import com.google.common.primitives.Longs;
import com.google.common.primitives.UnsignedBytes;
import com.google.common.primitives.UnsignedLongs;

import org.apache.spark.annotation.Private;
import org.apache.spark.unsafe.types.UTF8String;
Expand All @@ -37,32 +35,11 @@ private PrefixComparators() {}
public static final class StringPrefixComparator extends PrefixComparator {
@Override
public int compare(long aPrefix, long bPrefix) {
// TODO: can done more efficiently
byte[] a = Longs.toByteArray(aPrefix);
byte[] b = Longs.toByteArray(bPrefix);
for (int i = 0; i < 8; i++) {
int c = UnsignedBytes.compare(a[i], b[i]);
if (c != 0) return c;
}
return 0;
}

public long computePrefix(byte[] bytes) {
if (bytes == null) {
return 0L;
} else {
byte[] padded = new byte[8];
System.arraycopy(bytes, 0, padded, 0, Math.min(bytes.length, 8));
return Longs.fromByteArray(padded);
}
}

public long computePrefix(String value) {
return value == null ? 0L : computePrefix(value.getBytes(Charsets.UTF_8));
return UnsignedLongs.compare(aPrefix, bPrefix);
}

public long computePrefix(UTF8String value) {
return value == null ? 0L : computePrefix(value.getBytes());
return value == null ? 0L : value.getPrefix();
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,29 @@

package org.apache.spark.util.collection.unsafe.sort

import com.google.common.primitives.UnsignedBytes
import org.scalatest.prop.PropertyChecks

import org.apache.spark.SparkFunSuite
import org.apache.spark.unsafe.types.UTF8String

class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks {

test("String prefix comparator") {

def testPrefixComparison(s1: String, s2: String): Unit = {
val s1Prefix = PrefixComparators.STRING.computePrefix(s1)
val s2Prefix = PrefixComparators.STRING.computePrefix(s2)
val utf8string1 = UTF8String.fromString(s1)
val utf8string2 = UTF8String.fromString(s2)
val s1Prefix = PrefixComparators.STRING.computePrefix(utf8string1)
val s2Prefix = PrefixComparators.STRING.computePrefix(utf8string2)
val prefixComparisonResult = PrefixComparators.STRING.compare(s1Prefix, s2Prefix)

val cmp = UnsignedBytes.lexicographicalComparator().compare(
utf8string1.getBytes.take(8), utf8string2.getBytes.take(8))

assert(
(prefixComparisonResult == 0) ||
(prefixComparisonResult < 0 && s1 < s2) ||
(prefixComparisonResult > 0 && s1 > s2))
(prefixComparisonResult == 0 && cmp == 0) ||
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @JoshRosen

I made the randomized tester to be stricter. Previously it would still pass if prefix always return 0.

(prefixComparisonResult < 0 && s1.compareTo(s2) < 0) ||
(prefixComparisonResult > 0 && s1.compareTo(s2) > 0))
}

// scalastyle:off
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,15 @@ public int numChars() {
return len;
}

/**
* Returns a 64-bit integer that can be used as the prefix used in sorting.
*/
public long getPrefix() {
long p = PlatformDependent.UNSAFE.getLong(base, offset);
p = java.lang.Long.reverseBytes(p);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @tellison can you take a look at this (or find somebody else) to see if we can fix it by just not doing the reverse bytes in big-endian?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should check the numBytes here, or get random bytes if it's less than 8 bytes.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should also test the endian-ness.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I created https://github.com/apache/spark/pull/7789/files to make it safer. Still doesn't handle big endian though.

return p;
}

/**
* Returns the underline bytes, will be a copy of it if it's part of another array.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,19 @@ public void emptyStringTest() {
assertEquals(0, EMPTY_UTF8.numBytes());
}

@Test
public void prefix() {
assertTrue(fromString("a").getPrefix() - fromString("b").getPrefix() < 0);
assertTrue(fromString("ab").getPrefix() - fromString("b").getPrefix() < 0);
assertTrue(
fromString("abbbbbbbbbbbasdf").getPrefix() - fromString("bbbbbbbbbbbbasdf").getPrefix() < 0);
assertTrue(fromString("").getPrefix() - fromString("a").getPrefix() < 0);
assertTrue(fromString("你好").getPrefix() - fromString("世界").getPrefix() > 0);
}

@Test
public void compareTo() {
assertTrue(fromString("").compareTo(fromString("a")) < 0);
assertTrue(fromString("abc").compareTo(fromString("ABC")) > 0);
assertTrue(fromString("abc0").compareTo(fromString("abc")) > 0);
assertTrue(fromString("abcabcabc").compareTo(fromString("abcabcabc")) == 0);
Expand Down