Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions api/src/main/java/org/apache/iceberg/util/BinaryUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,23 @@ public class BinaryUtil {
private BinaryUtil() {
}

private static final ByteBuffer EMPTY_BYTE_BUFFER = ByteBuffer.allocate(0);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can make this lazily instantiated if we care. Or just return a new empty byte buffer instance every time in the function, but seems unnecessary to have all of those allocations.


/**
* Truncates the input byte buffer to the given length
* Truncates the input byte buffer to the given length.
* <p>
* We allow for a length of zero so that rows with empty string can be evaluated.
* Partition specs still cannot be created with a length of zero due to a constraint
* when parsing column truncation specs in {@code org.apache.iceberg.MetricsModes}.
*
* @param input The ByteBuffer to be truncated
* @param length The non-negative length to truncate input to
*/
public static ByteBuffer truncateBinary(ByteBuffer input, int length) {
Preconditions.checkArgument(length > 0, "Truncate length should be positive");
if (length >= input.remaining()) {
Preconditions.checkArgument(length >= 0, "Truncate length should be non-negative");
if (length == 0) {
return EMPTY_BYTE_BUFFER;
} else if (length >= input.remaining()) {
return input;
}
byte[] array = new byte[length];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ public class TestInclusiveMetricsEvaluator {
optional(10, "all_nulls_double", Types.DoubleType.get()),
optional(11, "all_nans_v1_stats", Types.FloatType.get()),
optional(12, "nan_and_null_only", Types.DoubleType.get()),
optional(13, "no_nan_stats", Types.DoubleType.get())
optional(13, "no_nan_stats", Types.DoubleType.get()),
optional(14, "some_empty", Types.StringType.get())
);

private static final int INT_MIN_VALUE = 30;
Expand All @@ -88,6 +89,7 @@ public class TestInclusiveMetricsEvaluator {
.put(11, 50L)
.put(12, 50L)
.put(13, 50L)
.put(14, 50L)
.build(),
// null value counts
ImmutableMap.<Integer, Long>builder()
Expand All @@ -97,6 +99,7 @@ public class TestInclusiveMetricsEvaluator {
.put(10, 50L)
.put(11, 0L)
.put(12, 1L)
.put(14, 0L)
.build(),
// nan value counts
ImmutableMap.of(
Expand All @@ -107,12 +110,14 @@ public class TestInclusiveMetricsEvaluator {
ImmutableMap.of(
1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE),
11, toByteBuffer(Types.FloatType.get(), Float.NaN),
12, toByteBuffer(Types.DoubleType.get(), Double.NaN)),
12, toByteBuffer(Types.DoubleType.get(), Double.NaN),
14, toByteBuffer(Types.StringType.get(), "")),
// upper bounds
ImmutableMap.of(
1, toByteBuffer(IntegerType.get(), INT_MAX_VALUE),
11, toByteBuffer(Types.FloatType.get(), Float.NaN),
12, toByteBuffer(Types.DoubleType.get(), Double.NaN)));
12, toByteBuffer(Types.DoubleType.get(), Double.NaN),
14, toByteBuffer(Types.StringType.get(), "房东整租霍营小区二层两居室")));

private static final DataFile FILE_2 = new TestDataFile("file_2.avro", Row.of(), 50,
// any value counts, including nulls
Expand Down Expand Up @@ -524,6 +529,12 @@ public void testStringStartsWith() {
shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "3str3x"), true).eval(FILE_3);
Assert.assertFalse("Should not read: range doesn't match", shouldRead);

shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("some_empty", "房东整租霍"), true).eval(FILE);
Assert.assertTrue("Should read: range matches", shouldRead);

shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("all_nulls", ""), true).eval(FILE);
Assert.assertFalse("Should not read: range doesn't match", shouldRead);

String aboveMax = UnicodeUtil.truncateStringMax(Literal.of("イロハニホヘト"), 4).value().toString();
shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", aboveMax), true).eval(FILE_4);
Assert.assertFalse("Should not read: range doesn't match", shouldRead);
Expand Down
24 changes: 24 additions & 0 deletions core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,37 @@
import org.junit.Assert;
import org.junit.Test;

import static org.apache.iceberg.util.BinaryUtil.truncateBinary;
import static org.apache.iceberg.util.BinaryUtil.truncateBinaryMax;
import static org.apache.iceberg.util.BinaryUtil.truncateBinaryMin;
import static org.apache.iceberg.util.UnicodeUtil.truncateStringMax;
import static org.apache.iceberg.util.UnicodeUtil.truncateStringMin;

@SuppressWarnings("checkstyle:LocalVariableName")
public class TestMetricsTruncation {

@Test
public void testTruncateBinary() {
  ByteBuffer original = ByteBuffer.wrap(new byte[]{1, 1, (byte) 0xFF, 2});
  ByteBuffer emptyByteBuffer = ByteBuffer.allocate(0);
  Comparator<ByteBuffer> cmp = Literal.of(original).comparator();

  // A zero-length truncation must yield a buffer with no remaining bytes
  Assert.assertEquals("Truncating to a length of zero should return an empty ByteBuffer",
      0, cmp.compare(truncateBinary(original, 0), emptyByteBuffer));
  // The contract is that the SAME buffer instance is returned (no copy), so assert identity
  Assert.assertSame("Truncating to the original buffer's remaining size should return the original buffer",
      original, truncateBinary(original, original.remaining()));
  Assert.assertSame("Truncating with a length greater than the input's remaining size should return the input",
      original, truncateBinary(original, 16));
  ByteBuffer truncated = truncateBinary(original, 2);
  Assert.assertTrue("Truncating with a length less than the input's remaining size should truncate properly",
      truncated.remaining() == 2 && truncated.position() == 0);
  // The truncated prefix must match the leading bytes of the input, not just have the right size
  Assert.assertEquals("Truncated buffer should contain the first bytes of the input",
      0, cmp.compare(truncated, ByteBuffer.wrap(new byte[]{1, 1})));
  Assert.assertTrue("Truncating should not modify the input buffer",
      original.remaining() == 4 && original.position() == 0);
  AssertHelpers.assertThrows("Should not allow a negative truncation length",
      IllegalArgumentException.class, "length should be non-negative",
      () -> truncateBinary(original, -1));
}

@Test
public void testTruncateBinaryMin() {
ByteBuffer test1 = ByteBuffer.wrap(new byte[] {1, 1, (byte) 0xFF, 2});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,7 @@ private List<Record> testRecords(Schema schema) {
return Lists.newArrayList(
record(schema, 0L, parse("2017-12-22T09:20:44.294658+00:00"), "junction"),
record(schema, 1L, parse("2017-12-22T07:15:34.582910+00:00"), "alligator"),
record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), "forrest"),
record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), ""),
record(schema, 3L, parse("2017-12-22T03:10:11.134509+00:00"), "clapping"),
record(schema, 4L, parse("2017-12-22T00:34:00.184671+00:00"), "brush"),
record(schema, 5L, parse("2017-12-21T22:20:08.935889+00:00"), "trap"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,7 @@ private List<Record> testRecords(Schema schema) {
return Lists.newArrayList(
record(schema, 0L, parse("2017-12-22T09:20:44.294658+00:00"), "junction"),
record(schema, 1L, parse("2017-12-22T07:15:34.582910+00:00"), "alligator"),
record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), "forrest"),
record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), ""),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can make a separate test if we feel that would be better. However, this test suite covers a very large number of cases so it seems like a smart place to add it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also think that it's a good opportunity to start introducing more edge case data into the tests without adding any overhead, and it tests this code path in a myriad of ways and does not affect the original test (though I'll have to align my open PR that currently touches this file, but that's not a problem at all).

Copy link
Contributor Author

@kbendick kbendick Jan 13, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, for reference, I did make several separate tests and they all passed. But it just seems cleaner to me to introduce more common edge cases into the existing test cases, especially when it doesn't require us to change the existing tests. I'm surprised that the tests that run on random data haven't caught this before. But most of the random data tests use a predefined seed, so I'm not that surprised.

That said, the tests in TestFilteredScan all go over the code paths that deal with this. And after we merge this, I can add in two small / inexpensive tests in the NOT_STARTS_WITH pr that will more explicitly cover this edge case - by partitioning a DF using a filter of data like '%' and data not like '%', which would throw an exception due to the truncation length without this change (though hopefully there aren't too many queries like the first one, but it's not my place to tell people how to write queries).

If there is any desire for a more explicit test or anything, please let me know 🙂

Copy link
Contributor Author

@kbendick kbendick Jan 13, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I actually found an inexpensive existing place to test truncateBinary itself. So the function will be better documented by means of the tests (if we agree that this change on the preconditions is acceptable).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm fine adding the test here, but I'd rather not change the existing cases. Would adding a new record test this without affecting the existing ones?

Copy link
Contributor Author

@kbendick kbendick Jan 15, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes unfortunately, adding a new record to be returned from this function would mean we'd have to change many of the existing tests, as many of them have a hardcoded assumption on 10 records.

I can add a new test in another (possibly new) file if we'd prefer. But in order to place the test in here to be run against what seems to be one of the more comprehensive test suites for hitting various evaluators in various scenarios (i.e. it tests against tables partitioned by the empty string record, tables not partitioned by the empty string record, unpartitioned tables), we'd have to either change an existing record or update the tests to be aware of 11 records.

In particular, we need to test on a scan of a table with a filter, which will cause the various evaluators which can return ROWS_MIGHT_MATCH.

I'll take another look to see if there's an existing test suite where a new test can be easily added to cover this scenario without affecting other tests.

Copy link
Contributor Author

@kbendick kbendick Jan 15, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I add the following test to this file, it does cover some of the codepaths (albeit with a somewhat funny SQL query). This will still go over the truncateBinary function when the input needs to be truncated to a length of zero, although this time it will be due to the predicate literal having a length of zero instead of the input data having a row with a length of zero. Either way, we truncate to the minimum length of either the input field or the predicate literal when using STARTS_WITH.

The following test would throw prior to this patch (and it doesn't require us to touch any of the input data for the other test suites).

  @Test
  public void testPartitionedByDataStartsWithEmptyStringFilter() {
    File location = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data");

    DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
        "path", location.toString())
    );

    IcebergSource source = new IcebergSource();
    DataSourceReader reader = source.createReader(options);
    pushFilters(reader, new StringStartsWith("data", ""));

    Assert.assertEquals(10, reader.planInputPartitions().size());
  }

Alternatively, I could add a new test file entirely and have full control over what the test does.

Copy link
Contributor Author

@kbendick kbendick Jan 16, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a test that does not change any of the existing test data or add a new file at all (it only extends existing tests), in TestInclusiveMetricsEvaluator. https://github.com/apache/iceberg/pull/2081/files#diff-a107d871a1a5d5c4029451db68cf5497ddc9432d90619e263270cf72c3536c23R73

I personally prefer changing the input data in the larger end-to-end test that covers many areas (TestFilteredScan), as it truly covers the situation where the bug was originally reported (and where it is likely to happen based on usages of truncateBinary) — filtered scans with empty row data.

But I've added another possible way to test the reported scenario -TestInclusiveMetricsEvaluator.

If you'd like me to add a standalone spark test or something, let me know. Or if you prefer the new test, let me know and I can revert the change to the input records of the other test suite.

Thanks for your review @rdblue 🙂

record(schema, 3L, parse("2017-12-22T03:10:11.134509+00:00"), "clapping"),
record(schema, 4L, parse("2017-12-22T00:34:00.184671+00:00"), "brush"),
record(schema, 5L, parse("2017-12-21T22:20:08.935889+00:00"), "trap"),
Expand Down