diff --git a/api/src/main/java/org/apache/iceberg/util/BinaryUtil.java b/api/src/main/java/org/apache/iceberg/util/BinaryUtil.java index 812085f07199..eb3bdcef692c 100644 --- a/api/src/main/java/org/apache/iceberg/util/BinaryUtil.java +++ b/api/src/main/java/org/apache/iceberg/util/BinaryUtil.java @@ -28,12 +28,23 @@ public class BinaryUtil { private BinaryUtil() { } + private static final ByteBuffer EMPTY_BYTE_BUFFER = ByteBuffer.allocate(0); + /** - * Truncates the input byte buffer to the given length + * Truncates the input byte buffer to the given length. + *

+ * We allow for a length of zero so that rows with empty string can be evaluated. + * Partition specs still cannot be created with a length of zero due to a constraint + * when parsing column truncation specs in {@code org.apache.iceberg.MetricsModes}. + * + * @param input The ByteBuffer to be truncated + * @param length The non-negative length to truncate input to */ public static ByteBuffer truncateBinary(ByteBuffer input, int length) { - Preconditions.checkArgument(length > 0, "Truncate length should be positive"); - if (length >= input.remaining()) { + Preconditions.checkArgument(length >= 0, "Truncate length should be non-negative"); + if (length == 0) { + return EMPTY_BYTE_BUFFER; + } else if (length >= input.remaining()) { return input; } byte[] array = new byte[length]; diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java index 56135c8a331b..4cb6376def7d 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java @@ -69,7 +69,8 @@ public class TestInclusiveMetricsEvaluator { optional(10, "all_nulls_double", Types.DoubleType.get()), optional(11, "all_nans_v1_stats", Types.FloatType.get()), optional(12, "nan_and_null_only", Types.DoubleType.get()), - optional(13, "no_nan_stats", Types.DoubleType.get()) + optional(13, "no_nan_stats", Types.DoubleType.get()), + optional(14, "some_empty", Types.StringType.get()) ); private static final int INT_MIN_VALUE = 30; @@ -88,6 +89,7 @@ public class TestInclusiveMetricsEvaluator { .put(11, 50L) .put(12, 50L) .put(13, 50L) + .put(14, 50L) .build(), // null value counts ImmutableMap.builder() @@ -97,6 +99,7 @@ public class TestInclusiveMetricsEvaluator { .put(10, 50L) .put(11, 0L) .put(12, 1L) + .put(14, 0L) .build(), // nan value counts ImmutableMap.of( @@ -107,12 +110,14 @@ public class TestInclusiveMetricsEvaluator { ImmutableMap.of( 1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE), 11, toByteBuffer(Types.FloatType.get(), Float.NaN), - 12, toByteBuffer(Types.DoubleType.get(), Double.NaN)), + 12, toByteBuffer(Types.DoubleType.get(), Double.NaN), + 14, toByteBuffer(Types.StringType.get(), "")), // upper bounds ImmutableMap.of( 1, toByteBuffer(IntegerType.get(), INT_MAX_VALUE), 11, toByteBuffer(Types.FloatType.get(), Float.NaN), - 12, toByteBuffer(Types.DoubleType.get(), Double.NaN))); + 12, toByteBuffer(Types.DoubleType.get(), Double.NaN), + 14, toByteBuffer(Types.StringType.get(), "房东整租霍营小区二层两居室"))); private static final DataFile FILE_2 = new TestDataFile("file_2.avro", Row.of(), 50, // any value counts, including nulls @@ -524,6 +529,12 @@ public void testStringStartsWith() { shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "3str3x"), true).eval(FILE_3); Assert.assertFalse("Should not read: range doesn't match", shouldRead); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("some_empty", "房东整租霍"), true).eval(FILE); + Assert.assertTrue("Should read: range matches", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("all_nulls", ""), true).eval(FILE); + Assert.assertFalse("Should not read: range doesn't match", shouldRead); + String aboveMax = UnicodeUtil.truncateStringMax(Literal.of("イロハニホヘト"), 4).value().toString(); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", aboveMax), true).eval(FILE_4); Assert.assertFalse("Should not read: range doesn't match", shouldRead); diff --git a/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java b/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java index af304dac3580..f006b1d848f2 100644 --- a/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java +++ b/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java @@ -25,6 +25,7 @@ import org.junit.Assert; import org.junit.Test; +import static org.apache.iceberg.util.BinaryUtil.truncateBinary; import static org.apache.iceberg.util.BinaryUtil.truncateBinaryMax; import static org.apache.iceberg.util.BinaryUtil.truncateBinaryMin; import static org.apache.iceberg.util.UnicodeUtil.truncateStringMax; @@ -32,6 +33,29 @@ @SuppressWarnings("checkstyle:LocalVariableName") public class TestMetricsTruncation { + + @Test + public void testTruncateBinary() { + ByteBuffer original = ByteBuffer.wrap(new byte[]{1, 1, (byte) 0xFF, 2}); + ByteBuffer emptyByteBuffer = ByteBuffer.allocate(0); + Comparator cmp = Literal.of(original).comparator(); + + Assert.assertEquals("Truncating to a length of zero should return an empty ByteBuffer", + 0, cmp.compare(truncateBinary(original, 0), emptyByteBuffer)); + Assert.assertEquals("Truncating to the original buffer's remaining size should return the original buffer", + original, truncateBinary(original, original.remaining())); + Assert.assertEquals("Truncating with a length greater than the input's remaining size should return the input", + original, truncateBinary(original, 16)); + ByteBuffer truncated = truncateBinary(original, 2); + Assert.assertTrue("Truncating with a length less than the input's remaining size should truncate properly", + truncated.remaining() == 2 && truncated.position() == 0); + Assert.assertTrue("Truncating should not modify the input buffer", + original.remaining() == 4 && original.position() == 0); + AssertHelpers.assertThrows("Should not allow a negative truncation length", + IllegalArgumentException.class, "length should be non-negative", + () -> truncateBinary(original, -1)); + } + @Test public void testTruncateBinaryMin() { ByteBuffer test1 = ByteBuffer.wrap(new byte[] {1, 1, (byte) 0xFF, 2}); diff --git a/spark2/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java b/spark2/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java index ad269107dc22..22e023f9efd3 100644 --- a/spark2/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java +++ b/spark2/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java @@ -575,7 +575,7 @@ private List testRecords(Schema schema) { return Lists.newArrayList( record(schema, 0L, parse("2017-12-22T09:20:44.294658+00:00"), "junction"), record(schema, 1L, parse("2017-12-22T07:15:34.582910+00:00"), "alligator"), - record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), "forrest"), + record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), ""), record(schema, 3L, parse("2017-12-22T03:10:11.134509+00:00"), "clapping"), record(schema, 4L, parse("2017-12-22T00:34:00.184671+00:00"), "brush"), record(schema, 5L, parse("2017-12-21T22:20:08.935889+00:00"), "trap"), diff --git a/spark3/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java b/spark3/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java index d4d9eafefdf0..b293e30dcd38 100644 --- a/spark3/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java +++ b/spark3/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java @@ -529,7 +529,7 @@ private List testRecords(Schema schema) { return Lists.newArrayList( record(schema, 0L, parse("2017-12-22T09:20:44.294658+00:00"), "junction"), record(schema, 1L, parse("2017-12-22T07:15:34.582910+00:00"), "alligator"), - record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), "forrest"), + record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), ""), record(schema, 3L, parse("2017-12-22T03:10:11.134509+00:00"), "clapping"), record(schema, 4L, parse("2017-12-22T00:34:00.184671+00:00"), "brush"), record(schema, 5L, parse("2017-12-21T22:20:08.935889+00:00"), "trap"),