Work around Guava issue 5648 in Bucket.BucketString.hash(CharSequence).

hash() now scans the value for surrogate chars; if one is found it hashes the
UTF-8 bytes of the string with MURMUR3.hashBytes instead of calling
MURMUR3.hashString. Values without surrogates keep using hashString.

TestBucketing.testStringWithSurrogatePair checks that the bucket hash of a
string containing a surrogate pair equals the hash of its UTF-8 bytes, and
additionally asserts that MURMUR3.hashString still differs from that value, so
the test fails (with a reminder to remove the fallback) once Guava is fixed.

NOTE(review): line breaks, blank lines and indentation below were
reconstructed from the hunk line counts (6 -> 12 and 6 -> 25); confirm against
the upstream patch.

diff --git a/api/src/main/java/org/apache/iceberg/transforms/Bucket.java b/api/src/main/java/org/apache/iceberg/transforms/Bucket.java
index 44577278a3ca..b862bfa09d72 100644
--- a/api/src/main/java/org/apache/iceberg/transforms/Bucket.java
+++ b/api/src/main/java/org/apache/iceberg/transforms/Bucket.java
@@ -236,6 +236,12 @@ private BucketString(int numBuckets) {
 
     @Override
     public int hash(CharSequence value) {
+      for (int i = 0; i < value.length(); i++) {
+        if (Character.isSurrogate(value.charAt(i))) {
+          // TODO remove the fallback to this (slower) code path once https://github.com/google/guava/issues/5648 is fixed
+          return MURMUR3.hashBytes(value.toString().getBytes(StandardCharsets.UTF_8)).asInt();
+        }
+      }
       return MURMUR3.hashString(value, StandardCharsets.UTF_8).asInt();
     }
 
diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestBucketing.java b/api/src/test/java/org/apache/iceberg/transforms/TestBucketing.java
index 9aeb5239bf40..65aecb02eaea 100644
--- a/api/src/test/java/org/apache/iceberg/transforms/TestBucketing.java
+++ b/api/src/test/java/org/apache/iceberg/transforms/TestBucketing.java
@@ -215,6 +215,25 @@ public void testString() {
         hashBytes(asBytes), bucketFunc.hash(string));
   }
 
+  @Test
+  public void testStringWithSurrogatePair() {
+    String string = "string with a surrogate pair: 💰";
+    Assert.assertNotEquals("string has no surrogate pairs", string.length(), string.codePoints().count());
+    byte[] asBytes = string.getBytes(StandardCharsets.UTF_8);
+
+    Bucket bucketFunc = Bucket.get(Types.StringType.get(), 100);
+
+    Assert.assertEquals("String hash should match hash of UTF-8 bytes",
+        hashBytes(asBytes), bucketFunc.hash(string));
+
+    Assert.assertNotEquals(
+        "It looks like Guava has been updated and now contains a fix for " +
+            "https://github.com/google/guava/issues/5648. Please resolve the TODO in BucketString.hash " +
+            "and remove this assertion",
+        hashBytes(asBytes),
+        MURMUR3.hashString(string, StandardCharsets.UTF_8).asInt());
+  }
+
   @Test
   public void testUtf8() {
     Utf8 utf8 = new Utf8("string to test murmur3 hash");