Skip to content

Commit 54db7cb

Browse files
original-brownbear authored and Sumedh Wale committed
[SPARK-21967][CORE] org.apache.spark.unsafe.types.UTF8String#compareTo Should Compare 8 Bytes at a Time for Better Performance
* Use 64-bit unsigned long comparison instead of unsigned int comparison in `org.apache.spark.unsafe.types.UTF8String#compareTo` for better performance.
* Make `IS_LITTLE_ENDIAN` a constant for correctness reasons (a `compareTo` implementation should not depend on a non-constant value, and endianness is definitely constant per JVM).

The build passes, and the functionality is widely covered by existing tests as far as I can see.

Author: Armin <[email protected]>

Closes apache#19180 from original-brownbear/SPARK-21967.
1 parent 0f7cea6 commit 54db7cb

File tree

1 file changed

+21
-69
lines changed

1 file changed

+21
-69
lines changed

common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java

Lines changed: 21 additions & 69 deletions
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,8 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
6868
5, 5, 5, 5,
6969
6, 6};
7070

71-
private static final boolean isLittleEndian = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN;
71+
private static final boolean IS_LITTLE_ENDIAN =
72+
ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN;
7273

7374
private static final UTF8String COMMA_UTF8 = UTF8String.fromString(",");
7475
public static final UTF8String EMPTY_UTF8 = UTF8String.fromString("");
@@ -195,7 +196,7 @@ public long getPrefix() {
195196
// After getting the data, we use a mask to mask out data that is not part of the string.
196197
long p;
197198
long mask = 0;
198-
if (isLittleEndian) {
199+
if (IS_LITTLE_ENDIAN) {
199200
if (numBytes >= 8) {
200201
p = Platform.getLong(base, offset);
201202
} else if (numBytes > 4) {
@@ -1054,77 +1055,28 @@ public int compareTo(@Nonnull final UTF8String other) {
10541055
return compare(other);
10551056
}
10561057

1057-
/** Read integer in big-endian format */
1058-
static int getIntBigEndian(final Object base, final long offset) {
1059-
return isLittleEndian ? Integer.reverseBytes(Platform.getInt(base, offset))
1060-
: Platform.getInt(base, offset);
1061-
}
1062-
1063-
/** Read long in big-endian format */
1064-
static long getLongBigEndian(final Object base, final long offset) {
1065-
return isLittleEndian ? Long.reverseBytes(Platform.getLong(base, offset))
1066-
: Platform.getLong(base, offset);
1067-
}
1068-
10691058
public final int compare(final UTF8String other) {
1070-
final Object rightBase = other.getBaseObject();
1071-
long rightOffset = other.getBaseOffset();
1072-
final Object leftBase = base;
1073-
long leftOffset = offset;
1074-
1075-
final int len = Math.min(numBytes, other.numBytes);
1076-
1077-
// for the case that compare will fail in first few bytes itself, the overhead
1078-
// of JNI call is too high
1079-
/*
1080-
// noinspection ConstantConditions
1081-
if (leftBase == null && rightBase == null &&
1082-
len >= Native.MIN_JNI_SIZE && Native.isLoaded()) {
1083-
final int result = Native.compareString(leftOffset, rightOffset, len);
1084-
return result != 0 ? result : (numBytes - other.numBytes);
1085-
}
1086-
*/
1087-
1088-
long endOffset = leftOffset + len;
1089-
// for architectures that support unaligned accesses, read 8 bytes at a time
1090-
if (Platform.unaligned() || (((leftOffset & 0x7) == 0) && ((rightOffset & 0x7) == 0))) {
1091-
endOffset -= 8;
1092-
while (leftOffset <= endOffset) {
1093-
// In UTF-8, the byte should be unsigned, so we should compare them as unsigned long.
1094-
final long ll = getLongBigEndian(leftBase, leftOffset);
1095-
final long rl = getLongBigEndian(rightBase, rightOffset);
1096-
final long res = ll - rl;
1097-
// If the sign of both values is same then "res" is with correct sign.
1098-
// If the sign of values is different then "res" has opposite sign.
1099-
// The XOR operations will revert the sign bit of res if sign of values is different.
1100-
// After that converting to signum is "(1 + ((v >> 63) << 1))"
1101-
// where (v >> 63) will flow the sign to give -1 or 0, and (1 + 2 times)
1102-
// of that will give -1 or 1 respectively.
1103-
if (res != 0) return (int)(1 + (((ll ^ rl ^ res) >> 63) << 1));
1104-
leftOffset += 8;
1105-
rightOffset += 8;
1106-
}
1107-
endOffset += 4;
1108-
if (leftOffset <= endOffset) {
1109-
// In UTF-8, the byte should be unsigned, so we should compare them as unsigned int
1110-
// which is done by converting to unsigned longs.
1111-
// After that conversion to signed integer is "(1 + ((v >> 63) << 1))" as above.
1112-
final long res = (getIntBigEndian(leftBase, leftOffset) & 0xffffffffL) -
1113-
(getIntBigEndian(rightBase, rightOffset) & 0xffffffffL);
1114-
if (res != 0) return (int)(1 + ((res >> 63) << 1));
1115-
leftOffset += 4;
1116-
rightOffset += 4;
1059+
int len = Math.min(numBytes, other.numBytes);
1060+
int wordMax = (len / 8) * 8;
1061+
long roffset = other.offset;
1062+
Object rbase = other.base;
1063+
for (int i = 0; i < wordMax; i += 8) {
1064+
long left = getLong(base, offset + i);
1065+
long right = getLong(rbase, roffset + i);
1066+
if (left != right) {
1067+
if (IS_LITTLE_ENDIAN) {
1068+
return Long.compareUnsigned(Long.reverseBytes(left), Long.reverseBytes(right));
1069+
} else {
1070+
return Long.compareUnsigned(left, right);
1071+
}
11171072
}
1118-
endOffset += 4;
11191073
}
1120-
// finish the remaining bytes
1121-
while (leftOffset < endOffset) {
1074+
for (int i = wordMax; i < len; i++) {
11221075
// In UTF-8, the byte should be unsigned, so we should compare them as unsigned int.
1123-
final int res = (Platform.getByte(leftBase, leftOffset) & 0xff) -
1124-
(Platform.getByte(rightBase, rightOffset) & 0xff);
1125-
if (res != 0) return res;
1126-
leftOffset++;
1127-
rightOffset++;
1076+
int res = (getByte(i) & 0xFF) - (Platform.getByte(rbase, roffset + i) & 0xFF);
1077+
if (res != 0) {
1078+
return res;
1079+
}
11281080
}
11291081
return numBytes - other.numBytes;
11301082
}

0 commit comments

Comments
 (0)