Skip to content

Commit f823780

Browse files
committed
Avoid relying on row-group row count for detecting only-null domain
ColumnChunkMetaData#getValueCount should be used to get the total value count for a column instead of BlockMetaData#getRowCount, because a single row may contain multiple values for a nested column type. Currently, row group pruning is not implemented for nested columns. This change fixes the logic for only-nulls domain detection in preparation for row group pruning of nested columns.
1 parent 8e4ff8f commit f823780

File tree

4 files changed

+78
-32
lines changed

4 files changed

+78
-32
lines changed

lib/trino-parquet/src/main/java/io/trino/parquet/predicate/PredicateUtils.java

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,12 @@ public static boolean predicateMatches(
139139
DateTimeZone timeZone)
140140
throws IOException
141141
{
142+
if (block.getRowCount() == 0) {
143+
return false;
144+
}
142145
Map<ColumnDescriptor, Statistics<?>> columnStatistics = getStatistics(block, descriptorsByPath);
143-
Optional<List<ColumnDescriptor>> candidateColumns = parquetPredicate.getIndexLookupCandidates(block.getRowCount(), columnStatistics, dataSource.getId());
146+
Map<ColumnDescriptor, Long> columnValueCounts = getColumnValueCounts(block, descriptorsByPath);
147+
Optional<List<ColumnDescriptor>> candidateColumns = parquetPredicate.getIndexLookupCandidates(columnValueCounts, columnStatistics, dataSource.getId());
144148
if (candidateColumns.isEmpty()) {
145149
return false;
146150
}
@@ -153,7 +157,7 @@ public static boolean predicateMatches(
153157
TupleDomainParquetPredicate indexPredicate = new TupleDomainParquetPredicate(parquetTupleDomain, candidateColumns.get(), timeZone);
154158

155159
// Page stats are finer-grained but relatively more expensive, so we apply this filtering after the block-level filtering above.
156-
if (columnIndexStore.isPresent() && !indexPredicate.matches(block.getRowCount(), columnIndexStore.get(), dataSource.getId())) {
160+
if (columnIndexStore.isPresent() && !indexPredicate.matches(columnValueCounts, columnIndexStore.get(), dataSource.getId())) {
157161
return false;
158162
}
159163

@@ -181,6 +185,18 @@ private static Map<ColumnDescriptor, Statistics<?>> getStatistics(BlockMetaData
181185
return statistics.buildOrThrow();
182186
}
183187

188+
private static Map<ColumnDescriptor, Long> getColumnValueCounts(BlockMetaData blockMetadata, Map<List<String>, ColumnDescriptor> descriptorsByPath)
189+
{
190+
ImmutableMap.Builder<ColumnDescriptor, Long> columnValueCounts = ImmutableMap.builder();
191+
for (ColumnChunkMetaData columnMetaData : blockMetadata.getColumns()) {
192+
ColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(columnMetaData.getPath().toArray()));
193+
if (descriptor != null) {
194+
columnValueCounts.put(descriptor, columnMetaData.getValueCount());
195+
}
196+
}
197+
return columnValueCounts.buildOrThrow();
198+
}
199+
184200
private static boolean dictionaryPredicatesMatch(
185201
TupleDomainParquetPredicate parquetPredicate,
186202
BlockMetaData blockMetadata,

lib/trino-parquet/src/main/java/io/trino/parquet/predicate/TupleDomainParquetPredicate.java

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ public TupleDomainParquetPredicate(TupleDomain<ColumnDescriptor> effectivePredic
9595
* and if it should, then return the columns that are candidates for further inspection of more
9696
* granular statistics from column index and dictionary.
9797
*
98-
* @param numberOfRows the number of rows in the segment; this can be used with
98+
* @param valueCounts the number of values for each column in the segment; this can be used with
9999
* Statistics to determine if a column is only null
100100
* @param statistics column statistics
101101
* @param id Parquet file name
@@ -105,12 +105,12 @@ public TupleDomainParquetPredicate(TupleDomain<ColumnDescriptor> effectivePredic
105105
* to potentially eliminate the file section. An optional with empty list is returned if there is
106106
* going to be no benefit in looking at column index or dictionary for any column.
107107
*/
108-
public Optional<List<ColumnDescriptor>> getIndexLookupCandidates(long numberOfRows, Map<ColumnDescriptor, Statistics<?>> statistics, ParquetDataSourceId id)
108+
public Optional<List<ColumnDescriptor>> getIndexLookupCandidates(
109+
Map<ColumnDescriptor, Long> valueCounts,
110+
Map<ColumnDescriptor, Statistics<?>> statistics,
111+
ParquetDataSourceId id)
109112
throws ParquetCorruptionException
110113
{
111-
if (numberOfRows == 0) {
112-
return Optional.empty();
113-
}
114114
if (effectivePredicate.isNone()) {
115115
return Optional.empty();
116116
}
@@ -131,10 +131,14 @@ public Optional<List<ColumnDescriptor>> getIndexLookupCandidates(long numberOfRo
131131
continue;
132132
}
133133

134+
Long columnValueCount = valueCounts.get(column);
135+
if (columnValueCount == null) {
136+
throw new IllegalArgumentException(format("Missing columnValueCount for column %s in %s", column, id));
137+
}
134138
Domain domain = getDomain(
135139
column,
136140
effectivePredicateDomain.getType(),
137-
numberOfRows,
141+
columnValueCount,
138142
columnStatistics,
139143
id,
140144
timeZone);
@@ -174,20 +178,15 @@ public boolean matches(DictionaryDescriptor dictionary)
174178
/**
175179
* Should the Parquet Reader process a file section with the specified statistics.
176180
*
177-
* @param numberOfRows the number of rows in the segment; this can be used with
181+
* @param valueCounts the number of values for each column in the segment; this can be used with
178182
* Statistics to determine if a column is only null
179183
* @param columnIndexStore column index (statistics) store
180184
* @param id Parquet file name
181185
*/
182-
public boolean matches(long numberOfRows, ColumnIndexStore columnIndexStore, ParquetDataSourceId id)
186+
public boolean matches(Map<ColumnDescriptor, Long> valueCounts, ColumnIndexStore columnIndexStore, ParquetDataSourceId id)
183187
throws ParquetCorruptionException
184188
{
185189
requireNonNull(columnIndexStore, "columnIndexStore is null");
186-
187-
if (numberOfRows == 0) {
188-
return false;
189-
}
190-
191190
if (effectivePredicate.isNone()) {
192191
return false;
193192
}
@@ -206,7 +205,11 @@ public boolean matches(long numberOfRows, ColumnIndexStore columnIndexStore, Par
206205
continue;
207206
}
208207

209-
Domain domain = getDomain(effectivePredicateDomain.getType(), numberOfRows, columnIndex, id, column, timeZone);
208+
Long columnValueCount = valueCounts.get(column);
209+
if (columnValueCount == null) {
210+
throw new IllegalArgumentException(format("Missing columnValueCount for column %s in %s", column, id));
211+
}
212+
Domain domain = getDomain(effectivePredicateDomain.getType(), columnValueCount, columnIndex, id, column, timeZone);
210213
if (!effectivePredicateDomain.overlaps(domain)) {
211214
return false;
212215
}
@@ -235,7 +238,7 @@ private boolean effectivePredicateMatches(Domain effectivePredicateDomain, Dicti
235238
public static Domain getDomain(
236239
ColumnDescriptor column,
237240
Type type,
238-
long rowCount,
241+
long columnValuesCount,
239242
Statistics<?> statistics,
240243
ParquetDataSourceId id,
241244
DateTimeZone timeZone)
@@ -245,7 +248,7 @@ public static Domain getDomain(
245248
return Domain.all(type);
246249
}
247250

248-
if (statistics.isNumNullsSet() && statistics.getNumNulls() == rowCount) {
251+
if (statistics.isNumNullsSet() && statistics.getNumNulls() == columnValuesCount) {
249252
return Domain.onlyNull(type);
250253
}
251254

@@ -437,7 +440,7 @@ private static Domain getDomain(
437440
@VisibleForTesting
438441
public static Domain getDomain(
439442
Type type,
440-
long rowCount,
443+
long columnValuesCount,
441444
ColumnIndex columnIndex,
442445
ParquetDataSourceId id,
443446
ColumnDescriptor descriptor,
@@ -466,7 +469,7 @@ public static Domain getDomain(
466469
.sum();
467470
boolean hasNullValue = totalNullCount > 0;
468471

469-
if (hasNullValue && totalNullCount == rowCount) {
472+
if (hasNullValue && totalNullCount == columnValuesCount) {
470473
return Domain.onlyNull(type);
471474
}
472475

lib/trino-parquet/src/test/java/io/trino/parquet/TestTupleDomainParquetPredicate.java

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,8 @@ public void testBigint()
154154

155155
assertEquals(getDomain(columnDescriptor, BIGINT, 10, longColumnStats(0L, 100L), ID, UTC), create(ValueSet.ofRanges(range(BIGINT, 0L, true, 100L, true)), false));
156156

157-
assertEquals(getDomain(columnDescriptor, BIGINT, 20, longOnlyNullsStats(10), ID, UTC), create(ValueSet.all(BIGINT), true));
157+
assertEquals(getDomain(columnDescriptor, BIGINT, 20, longOnlyNullsStats(10), ID, UTC), Domain.all(BIGINT));
158+
assertEquals(getDomain(columnDescriptor, BIGINT, 20, longOnlyNullsStats(20), ID, UTC), Domain.onlyNull(BIGINT));
158159
// fail on corrupted statistics
159160
assertThatExceptionOfType(ParquetCorruptionException.class)
160161
.isThrownBy(() -> getDomain(columnDescriptor, BIGINT, 10, longColumnStats(100L, 10L), ID, UTC))
@@ -555,7 +556,7 @@ public void testVarcharMatchesWithStatistics()
555556
.withMax(value.getBytes(UTF_8))
556557
.withNumNulls(1L)
557558
.build();
558-
assertThat(parquetPredicate.getIndexLookupCandidates(2, ImmutableMap.of(column, stats), ID))
559+
assertThat(parquetPredicate.getIndexLookupCandidates(ImmutableMap.of(column, 2L), ImmutableMap.of(column, stats), ID))
559560
.isEqualTo(Optional.of(ImmutableList.of(column)));
560561
}
561562

@@ -569,10 +570,10 @@ public void testIntegerMatchesWithStatistics(Type typeForParquetInt32)
569570
Domain.create(ValueSet.of(typeForParquetInt32, 42L, 43L, 44L, 112L), false)));
570571
TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column), UTC);
571572

572-
assertThat(parquetPredicate.getIndexLookupCandidates(2, ImmutableMap.of(column, intColumnStats(32, 42)), ID))
573+
assertThat(parquetPredicate.getIndexLookupCandidates(ImmutableMap.of(column, 2L), ImmutableMap.of(column, intColumnStats(32, 42)), ID))
573574
.isEqualTo(Optional.of(ImmutableList.of(column)));
574-
assertThat(parquetPredicate.getIndexLookupCandidates(2, ImmutableMap.of(column, intColumnStats(30, 40)), ID)).isEmpty();
575-
assertThat(parquetPredicate.getIndexLookupCandidates(2, ImmutableMap.of(column, intColumnStats(1024, 0x10000 + 42)), ID).isPresent())
575+
assertThat(parquetPredicate.getIndexLookupCandidates(ImmutableMap.of(column, 2L), ImmutableMap.of(column, intColumnStats(30, 40)), ID)).isEmpty();
576+
assertThat(parquetPredicate.getIndexLookupCandidates(ImmutableMap.of(column, 2L), ImmutableMap.of(column, intColumnStats(1024, 0x10000 + 42)), ID).isPresent())
576577
.isEqualTo(typeForParquetInt32 != INTEGER); // stats invalid for smallint/tinyint
577578
}
578579

@@ -596,10 +597,10 @@ public void testBigintMatchesWithStatistics()
596597
Domain.create(ValueSet.of(BIGINT, 42L, 43L, 44L, 404L), false)));
597598
TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column), UTC);
598599

599-
assertThat(parquetPredicate.getIndexLookupCandidates(2, ImmutableMap.of(column, longColumnStats(32, 42)), ID))
600+
assertThat(parquetPredicate.getIndexLookupCandidates(ImmutableMap.of(column, 2L), ImmutableMap.of(column, longColumnStats(32, 42)), ID))
600601
.isEqualTo(Optional.of(ImmutableList.of(column)));
601-
assertThat(parquetPredicate.getIndexLookupCandidates(2, ImmutableMap.of(column, longColumnStats(30, 40)), ID)).isEmpty();
602-
assertThat(parquetPredicate.getIndexLookupCandidates(2, ImmutableMap.of(column, longColumnStats(1024, 0x10000 + 42)), ID)).isEmpty();
602+
assertThat(parquetPredicate.getIndexLookupCandidates(ImmutableMap.of(column, 2L), ImmutableMap.of(column, longColumnStats(30, 40)), ID)).isEmpty();
603+
assertThat(parquetPredicate.getIndexLookupCandidates(ImmutableMap.of(column, 2L), ImmutableMap.of(column, longColumnStats(1024, 0x10000 + 42)), ID)).isEmpty();
603604
}
604605

605606
@Test
@@ -669,23 +670,23 @@ public void testIndexLookupCandidates()
669670

670671
TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(columnA), UTC);
671672
assertThat(parquetPredicate.getIndexLookupCandidates(
672-
2,
673+
ImmutableMap.of(columnA, 2L, columnB, 2L),
673674
ImmutableMap.of(columnA, longColumnStats(32, 42), columnB, longColumnStats(42, 500)), ID))
674675
.isEqualTo(Optional.of(ImmutableList.of(columnA)));
675676

676677
parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, ImmutableList.of(columnA, columnB), UTC);
677678
// column stats missing on columnB
678-
assertThat(parquetPredicate.getIndexLookupCandidates(2, ImmutableMap.of(columnA, longColumnStats(32, 42)), ID))
679+
assertThat(parquetPredicate.getIndexLookupCandidates(ImmutableMap.of(columnA, 2L), ImmutableMap.of(columnA, longColumnStats(32, 42)), ID))
679680
.isEqualTo(Optional.of(ImmutableList.of(columnA, columnB)));
680681

681682
// All possible values for columnB are covered by effectivePredicate
682683
assertThat(parquetPredicate.getIndexLookupCandidates(
683-
2,
684+
ImmutableMap.of(columnA, 2L, columnB, 2L),
684685
ImmutableMap.of(columnA, longColumnStats(32, 42), columnB, longColumnStats(50, 400)), ID))
685686
.isEqualTo(Optional.of(ImmutableList.of(columnA)));
686687

687688
assertThat(parquetPredicate.getIndexLookupCandidates(
688-
2,
689+
ImmutableMap.of(columnA, 2L, columnB, 2L),
689690
ImmutableMap.of(columnA, longColumnStats(32, 42), columnB, longColumnStats(42, 500)), ID))
690691
.isEqualTo(Optional.of(ImmutableList.of(columnA, columnB)));
691692
}

plugin/trino-hive/src/test/java/io/trino/plugin/hive/BaseHiveConnectorTest.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5187,6 +5187,32 @@ private void testParquetDictionaryPredicatePushdown(Session session)
51875187
assertNoDataRead("SELECT * FROM " + tableName + " WHERE n = 3");
51885188
}
51895189

5190+
@Test
5191+
public void testParquetOnlyNullsRowGroupPruning()
5192+
{
5193+
String tableName = "test_primitive_column_nulls_pruning_" + randomNameSuffix();
5194+
assertUpdate("CREATE TABLE " + tableName + " (col BIGINT) WITH (format = 'PARQUET')");
5195+
assertUpdate("INSERT INTO " + tableName + " SELECT * FROM unnest(repeat(NULL, 4096))", 4096);
5196+
assertNoDataRead("SELECT * FROM " + tableName + " WHERE col IS NOT NULL");
5197+
5198+
tableName = "test_nested_column_nulls_pruning_" + randomNameSuffix();
5199+
// Nested column `a` has a null count of 4096 and contains only nulls
5200+
// Nested column `b` also has a null count of 4096, but it contains non-null values as well
5201+
assertUpdate("CREATE TABLE " + tableName + " (col ROW(a BIGINT, b ARRAY(DOUBLE))) WITH (format = 'PARQUET')");
5202+
assertUpdate("INSERT INTO " + tableName + " SELECT * FROM unnest(transform(repeat(1, 4096), x -> ROW(ROW(NULL, ARRAY [NULL, rand()]))))", 4096);
5203+
// TODO replace with assertNoDataRead after nested column predicate pushdown
5204+
assertQueryStats(
5205+
getSession(),
5206+
"SELECT * FROM " + tableName + " WHERE col.a IS NOT NULL",
5207+
queryStats -> assertThat(queryStats.getProcessedInputDataSize().toBytes()).isGreaterThan(0),
5208+
results -> assertThat(results.getRowCount()).isEqualTo(0));
5209+
assertQueryStats(
5210+
getSession(),
5211+
"SELECT * FROM " + tableName + " WHERE col.b IS NOT NULL",
5212+
queryStats -> assertThat(queryStats.getProcessedInputDataSize().toBytes()).isGreaterThan(0),
5213+
results -> assertThat(results.getRowCount()).isEqualTo(4096));
5214+
}
5215+
51905216
private void assertNoDataRead(@Language("SQL") String sql)
51915217
{
51925218
assertQueryStats(

0 commit comments

Comments
 (0)