Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions api/src/main/java/org/apache/iceberg/util/BinaryUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,23 @@ public class BinaryUtil {
private BinaryUtil() {
}

private static final ByteBuffer EMPTY_BYTE_BUFFER = ByteBuffer.allocate(0);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can make this lazily instantiated if we care. Or just return a new empty byte buffer instance every time in the function, but seems unnecessary to have all of those allocations.


/**
* Truncates the input byte buffer to the given length
* Truncates the input byte buffer to the given length.
* <p>
* We allow for a length of zero so that rows with empty string can be evaluated.
* Partition specs still cannot be created with a length of zero due to a constraint
* when parsing column truncation specs in {@code org.apache.iceberg.MetricsModes}.
*
* @param input The ByteBuffer to be truncated
* @param length The non-negative length to truncate input to
*/
public static ByteBuffer truncateBinary(ByteBuffer input, int length) {
Preconditions.checkArgument(length > 0, "Truncate length should be positive");
if (length >= input.remaining()) {
Preconditions.checkArgument(length >= 0, "Truncate length should be non-negative");
if (length == 0) {
return EMPTY_BYTE_BUFFER;
} else if (length >= input.remaining()) {
return input;
}
byte[] array = new byte[length];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ public class TestInclusiveMetricsEvaluator {
optional(10, "all_nulls_double", Types.DoubleType.get()),
optional(11, "all_nans_v1_stats", Types.FloatType.get()),
optional(12, "nan_and_null_only", Types.DoubleType.get()),
optional(13, "no_nan_stats", Types.DoubleType.get())
optional(13, "no_nan_stats", Types.DoubleType.get()),
optional(14, "some_empty", Types.StringType.get())
);

private static final int INT_MIN_VALUE = 30;
Expand All @@ -88,6 +89,7 @@ public class TestInclusiveMetricsEvaluator {
.put(11, 50L)
.put(12, 50L)
.put(13, 50L)
.put(14, 50L)
.build(),
// null value counts
ImmutableMap.<Integer, Long>builder()
Expand All @@ -97,6 +99,7 @@ public class TestInclusiveMetricsEvaluator {
.put(10, 50L)
.put(11, 0L)
.put(12, 1L)
.put(14, 0L)
.build(),
// nan value counts
ImmutableMap.of(
Expand All @@ -107,12 +110,14 @@ public class TestInclusiveMetricsEvaluator {
ImmutableMap.of(
1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE),
11, toByteBuffer(Types.FloatType.get(), Float.NaN),
12, toByteBuffer(Types.DoubleType.get(), Double.NaN)),
12, toByteBuffer(Types.DoubleType.get(), Double.NaN),
14, toByteBuffer(Types.StringType.get(), "")),
// upper bounds
ImmutableMap.of(
1, toByteBuffer(IntegerType.get(), INT_MAX_VALUE),
11, toByteBuffer(Types.FloatType.get(), Float.NaN),
12, toByteBuffer(Types.DoubleType.get(), Double.NaN)));
12, toByteBuffer(Types.DoubleType.get(), Double.NaN),
14, toByteBuffer(Types.StringType.get(), "房东整租霍营小区二层两居室")));

private static final DataFile FILE_2 = new TestDataFile("file_2.avro", Row.of(), 50,
// any value counts, including nulls
Expand Down Expand Up @@ -524,6 +529,12 @@ public void testStringStartsWith() {
shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "3str3x"), true).eval(FILE_3);
Assert.assertFalse("Should not read: range doesn't match", shouldRead);

shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("some_empty", "房东整租霍"), true).eval(FILE);
Assert.assertTrue("Should read: range matches", shouldRead);

shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("all_nulls", ""), true).eval(FILE);
Assert.assertFalse("Should not read: range doesn't match", shouldRead);

String aboveMax = UnicodeUtil.truncateStringMax(Literal.of("イロハニホヘト"), 4).value().toString();
shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", aboveMax), true).eval(FILE_4);
Assert.assertFalse("Should not read: range doesn't match", shouldRead);
Expand Down
24 changes: 24 additions & 0 deletions core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,37 @@
import org.junit.Assert;
import org.junit.Test;

import static org.apache.iceberg.util.BinaryUtil.truncateBinary;
import static org.apache.iceberg.util.BinaryUtil.truncateBinaryMax;
import static org.apache.iceberg.util.BinaryUtil.truncateBinaryMin;
import static org.apache.iceberg.util.UnicodeUtil.truncateStringMax;
import static org.apache.iceberg.util.UnicodeUtil.truncateStringMin;

@SuppressWarnings("checkstyle:LocalVariableName")
public class TestMetricsTruncation {

@Test
public void testTruncateBinary() {
  ByteBuffer original = ByteBuffer.wrap(new byte[]{1, 1, (byte) 0xFF, 2});
  ByteBuffer emptyByteBuffer = ByteBuffer.allocate(0);
  Comparator<ByteBuffer> cmp = Literal.of(original).comparator();

  // A zero-length truncation must yield a buffer with no remaining bytes
  Assert.assertEquals("Truncating to a length of zero should return an empty ByteBuffer",
      0, cmp.compare(truncateBinary(original, 0), emptyByteBuffer));
  // The contract is that the SAME buffer instance is returned (no copy), so assert identity
  Assert.assertSame("Truncating to the original buffer's remaining size should return the original buffer",
      original, truncateBinary(original, original.remaining()));
  Assert.assertSame("Truncating with a length greater than the input's remaining size should return the input",
      original, truncateBinary(original, 16));
  ByteBuffer truncated = truncateBinary(original, 2);
  Assert.assertTrue("Truncating with a length less than the input's remaining size should truncate properly",
      truncated.remaining() == 2 && truncated.position() == 0);
  // The truncated prefix must match the leading bytes of the input, not just have the right size
  Assert.assertEquals("Truncated buffer should contain the first bytes of the input",
      0, cmp.compare(truncated, ByteBuffer.wrap(new byte[]{1, 1})));
  Assert.assertTrue("Truncating should not modify the input buffer",
      original.remaining() == 4 && original.position() == 0);
  AssertHelpers.assertThrows("Should not allow a negative truncation length",
      IllegalArgumentException.class, "length should be non-negative",
      () -> truncateBinary(original, -1));
}

@Test
public void testTruncateBinaryMin() {
ByteBuffer test1 = ByteBuffer.wrap(new byte[] {1, 1, (byte) 0xFF, 2});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,7 @@ private List<Record> testRecords(Schema schema) {
return Lists.newArrayList(
record(schema, 0L, parse("2017-12-22T09:20:44.294658+00:00"), "junction"),
record(schema, 1L, parse("2017-12-22T07:15:34.582910+00:00"), "alligator"),
record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), "forrest"),
record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), ""),
record(schema, 3L, parse("2017-12-22T03:10:11.134509+00:00"), "clapping"),
record(schema, 4L, parse("2017-12-22T00:34:00.184671+00:00"), "brush"),
record(schema, 5L, parse("2017-12-21T22:20:08.935889+00:00"), "trap"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,7 @@ private List<Record> testRecords(Schema schema) {
return Lists.newArrayList(
record(schema, 0L, parse("2017-12-22T09:20:44.294658+00:00"), "junction"),
record(schema, 1L, parse("2017-12-22T07:15:34.582910+00:00"), "alligator"),
record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), "forrest"),
record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), ""),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can make a separate test if we feel that would be better. However, this test suite covers a very large number of cases so it seems like a smart place to add it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also think that it's a good opportunity to start introducing more edge case data into the tests without adding any overhead, and it tests this code path in a myriad of ways and does not affect the original test (though I'll have to align my open PR that currently touches this file, but that's not a problem at all).

Copy link
Contributor Author

@kbendick kbendick Jan 13, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, for reference, I did make several separate tests and they all passed. But it just seems cleaner to me to introduce more common edge cases into the existing test cases, especially when it doesn't require us to change the existing tests. I'm surprised that the tests that run on random data haven't caught this before. But most of the random data tests use a predefined seed, so I'm not that surprised.

That said, the tests in TestFilteredScan all go over the code paths that deal with this. And after we merge this, I can add in two small / inexpensive tests in the NOT_STARTS_WITH pr that will more explicitly cover this edge case - by partitioning a DF using a filter of data like '%' and data not like '%', which would throw an exception due to the truncation length without this change (though hopefully there aren't too many queries like the first one, but it's not my place to tell people how to write queries).

If there is any desire for a more explicit test or anything, please let me know 🙂

Copy link
Contributor Author

@kbendick kbendick Jan 13, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I actually found an inexpensive existing place to test truncateBinary itself. So the function will be better documented by means of the tests (if we agree that this change on the preconditions is acceptable).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm fine adding the test here, but I'd rather not change the existing cases. Would adding a new record test this without affecting the existing ones?

Copy link
Contributor Author

@kbendick kbendick Jan 15, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes unfortunately, adding a new record to be returned from this function would mean we'd have to change many of the existing tests, as many of them have a hardcoded assumption on 10 records.

I can add a new test in another (possibly new) file if we'd prefer. But in order to place the test in here to be run against what seems to be one of the more comprehensive test suites for hitting various evaluators in various scenarios (i.e. it tests against tables partitioned by the empty string record, tables not partitioned by the empty string record, unpartitioned tables), we'd have to either change an existing record or update the tests to be aware of 11 records.

In particular, we need to test on a scan of a table with a filter, which will cause the various evaluators which can return ROWS_MIGHT_MATCH.

I'll take another look to see if there's an existing test suite where a new test can be easily added to cover this scenario without affecting other tests.

Copy link
Contributor Author

@kbendick kbendick Jan 15, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I add the following test to this file, it does cover some of the codepaths (albeit with a somewhat funny SQL query). This will still go over the truncateBinary function when the input needs to be truncated to a length of zero, although this time it will be due to the predicate literal having a length of zero instead of the input data having a row with a length of zero. Either way, we truncate to the minimum length of either the input field or the predicate literal when using STARTS_WITH.

The following test would throw prior to this patch (and it doesn't require us to touch any of the input data for the other test suites).

  @Test
  public void testPartitionedByDataStartsWithEmptyStringFilter() {
    File location = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data");

    DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
        "path", location.toString())
    );

    IcebergSource source = new IcebergSource();
    DataSourceReader reader = source.createReader(options);
    pushFilters(reader, new StringStartsWith("data", ""));

    Assert.assertEquals(10, reader.planInputPartitions().size());
  }

Alternatively, I could add a new test file entirely and have full control over what the test does.

Copy link
Contributor Author

@kbendick kbendick Jan 16, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a test that does not change any of the existing test data or add a new file at all (it only extends existing tests), in TestInclusiveMetricsEvaluator. https://github.com/apache/iceberg/pull/2081/files#diff-a107d871a1a5d5c4029451db68cf5497ddc9432d90619e263270cf72c3536c23R73

I personally prefer changing the input data in the larger end-to-end test that covers many areas (TestFilteredScan), as it truly covers the situation where the bug was originally reported (and where it is likely to happen based on usages of truncateBinary) — filtered scans with empty row data.

But I've added another possible way to test the reported scenario -TestInclusiveMetricsEvaluator.

If you'd like me to add a standalone spark test or something, let me know. Or if you prefer the new test, let me know and I can revert the change to the input records of the other test suite.

Thanks for your review @rdblue 🙂

record(schema, 3L, parse("2017-12-22T03:10:11.134509+00:00"), "clapping"),
record(schema, 4L, parse("2017-12-22T00:34:00.184671+00:00"), "brush"),
record(schema, 5L, parse("2017-12-21T22:20:08.935889+00:00"), "trap"),
Expand Down