diff --git a/api/src/main/java/org/apache/iceberg/util/BinaryUtil.java b/api/src/main/java/org/apache/iceberg/util/BinaryUtil.java
index 812085f07199..eb3bdcef692c 100644
--- a/api/src/main/java/org/apache/iceberg/util/BinaryUtil.java
+++ b/api/src/main/java/org/apache/iceberg/util/BinaryUtil.java
@@ -28,12 +28,23 @@ public class BinaryUtil {
private BinaryUtil() {
}
+ private static final ByteBuffer EMPTY_BYTE_BUFFER = ByteBuffer.allocate(0);
+
/**
- * Truncates the input byte buffer to the given length
+ * Truncates the input byte buffer to the given length.
+ *
+ * We allow for a length of zero so that rows with empty string can be evaluated.
+ * Partition specs still cannot be created with a length of zero due to a constraint
+ * when parsing column truncation specs in {@code org.apache.iceberg.MetricsModes}.
+ *
+ * @param input The ByteBuffer to be truncated
+ * @param length The non-negative length to truncate input to
*/
public static ByteBuffer truncateBinary(ByteBuffer input, int length) {
- Preconditions.checkArgument(length > 0, "Truncate length should be positive");
- if (length >= input.remaining()) {
+ Preconditions.checkArgument(length >= 0, "Truncate length should be non-negative");
+ if (length == 0) {
+ return EMPTY_BYTE_BUFFER;
+ } else if (length >= input.remaining()) {
return input;
}
byte[] array = new byte[length];
diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java
index 56135c8a331b..4cb6376def7d 100644
--- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java
+++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java
@@ -69,7 +69,8 @@ public class TestInclusiveMetricsEvaluator {
optional(10, "all_nulls_double", Types.DoubleType.get()),
optional(11, "all_nans_v1_stats", Types.FloatType.get()),
optional(12, "nan_and_null_only", Types.DoubleType.get()),
- optional(13, "no_nan_stats", Types.DoubleType.get())
+ optional(13, "no_nan_stats", Types.DoubleType.get()),
+ optional(14, "some_empty", Types.StringType.get())
);
private static final int INT_MIN_VALUE = 30;
@@ -88,6 +89,7 @@ public class TestInclusiveMetricsEvaluator {
.put(11, 50L)
.put(12, 50L)
.put(13, 50L)
+ .put(14, 50L)
.build(),
// null value counts
ImmutableMap.builder()
@@ -97,6 +99,7 @@ public class TestInclusiveMetricsEvaluator {
.put(10, 50L)
.put(11, 0L)
.put(12, 1L)
+ .put(14, 0L)
.build(),
// nan value counts
ImmutableMap.of(
@@ -107,12 +110,14 @@ public class TestInclusiveMetricsEvaluator {
ImmutableMap.of(
1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE),
11, toByteBuffer(Types.FloatType.get(), Float.NaN),
- 12, toByteBuffer(Types.DoubleType.get(), Double.NaN)),
+ 12, toByteBuffer(Types.DoubleType.get(), Double.NaN),
+ 14, toByteBuffer(Types.StringType.get(), "")),
// upper bounds
ImmutableMap.of(
1, toByteBuffer(IntegerType.get(), INT_MAX_VALUE),
11, toByteBuffer(Types.FloatType.get(), Float.NaN),
- 12, toByteBuffer(Types.DoubleType.get(), Double.NaN)));
+ 12, toByteBuffer(Types.DoubleType.get(), Double.NaN),
+ 14, toByteBuffer(Types.StringType.get(), "房东整租霍营小区二层两居室")));
private static final DataFile FILE_2 = new TestDataFile("file_2.avro", Row.of(), 50,
// any value counts, including nulls
@@ -524,6 +529,12 @@ public void testStringStartsWith() {
shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "3str3x"), true).eval(FILE_3);
Assert.assertFalse("Should not read: range doesn't match", shouldRead);
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("some_empty", "房东整租霍"), true).eval(FILE);
+ Assert.assertTrue("Should read: range matches", shouldRead);
+
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("all_nulls", ""), true).eval(FILE);
+ Assert.assertFalse("Should not read: range doesn't match", shouldRead);
+
String aboveMax = UnicodeUtil.truncateStringMax(Literal.of("イロハニホヘト"), 4).value().toString();
shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", aboveMax), true).eval(FILE_4);
Assert.assertFalse("Should not read: range doesn't match", shouldRead);
diff --git a/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java b/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java
index af304dac3580..f006b1d848f2 100644
--- a/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java
+++ b/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java
@@ -25,6 +25,7 @@
import org.junit.Assert;
import org.junit.Test;
+import static org.apache.iceberg.util.BinaryUtil.truncateBinary;
import static org.apache.iceberg.util.BinaryUtil.truncateBinaryMax;
import static org.apache.iceberg.util.BinaryUtil.truncateBinaryMin;
import static org.apache.iceberg.util.UnicodeUtil.truncateStringMax;
@@ -32,6 +33,29 @@
@SuppressWarnings("checkstyle:LocalVariableName")
public class TestMetricsTruncation {
+
+ @Test
+ public void testTruncateBinary() {
+ ByteBuffer original = ByteBuffer.wrap(new byte[]{1, 1, (byte) 0xFF, 2});
+ ByteBuffer emptyByteBuffer = ByteBuffer.allocate(0);
+ Comparator cmp = Literal.of(original).comparator();
+
+ Assert.assertEquals("Truncating to a length of zero should return an empty ByteBuffer",
+ 0, cmp.compare(truncateBinary(original, 0), emptyByteBuffer));
+ Assert.assertEquals("Truncating to the original buffer's remaining size should return the original buffer",
+ original, truncateBinary(original, original.remaining()));
+ Assert.assertEquals("Truncating with a length greater than the input's remaining size should return the input",
+ original, truncateBinary(original, 16));
+ ByteBuffer truncated = truncateBinary(original, 2);
+ Assert.assertTrue("Truncating with a length less than the input's remaining size should truncate properly",
+ truncated.remaining() == 2 && truncated.position() == 0);
+ Assert.assertTrue("Truncating should not modify the input buffer",
+ original.remaining() == 4 && original.position() == 0);
+ AssertHelpers.assertThrows("Should not allow a negative truncation length",
+ IllegalArgumentException.class, "length should be non-negative",
+ () -> truncateBinary(original, -1));
+ }
+
@Test
public void testTruncateBinaryMin() {
ByteBuffer test1 = ByteBuffer.wrap(new byte[] {1, 1, (byte) 0xFF, 2});
diff --git a/spark2/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java b/spark2/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java
index ad269107dc22..22e023f9efd3 100644
--- a/spark2/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java
+++ b/spark2/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java
@@ -575,7 +575,7 @@ private List testRecords(Schema schema) {
return Lists.newArrayList(
record(schema, 0L, parse("2017-12-22T09:20:44.294658+00:00"), "junction"),
record(schema, 1L, parse("2017-12-22T07:15:34.582910+00:00"), "alligator"),
- record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), "forrest"),
+ record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), ""),
record(schema, 3L, parse("2017-12-22T03:10:11.134509+00:00"), "clapping"),
record(schema, 4L, parse("2017-12-22T00:34:00.184671+00:00"), "brush"),
record(schema, 5L, parse("2017-12-21T22:20:08.935889+00:00"), "trap"),
diff --git a/spark3/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java b/spark3/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java
index d4d9eafefdf0..b293e30dcd38 100644
--- a/spark3/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java
+++ b/spark3/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java
@@ -529,7 +529,7 @@ private List testRecords(Schema schema) {
return Lists.newArrayList(
record(schema, 0L, parse("2017-12-22T09:20:44.294658+00:00"), "junction"),
record(schema, 1L, parse("2017-12-22T07:15:34.582910+00:00"), "alligator"),
- record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), "forrest"),
+ record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), ""),
record(schema, 3L, parse("2017-12-22T03:10:11.134509+00:00"), "clapping"),
record(schema, 4L, parse("2017-12-22T00:34:00.184671+00:00"), "brush"),
record(schema, 5L, parse("2017-12-21T22:20:08.935889+00:00"), "trap"),