Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ public boolean matches(long numberOfRows, Map<ColumnDescriptor, Statistics<?>> s
}

@Override
public boolean matches(Map<ColumnDescriptor, DictionaryDescriptor> dictionaries)
public boolean matches(DictionaryDescriptor dictionary)
{
return true;
}
Expand All @@ -51,9 +51,11 @@ boolean matches(long numberOfRows, Map<ColumnDescriptor, Statistics<?>> statisti
throws ParquetCorruptionException;

/**
* Should the Parquet Reader process a file section with the specified dictionary.
* Should the Parquet Reader process a file section, judged by this single column's
* dictionary alone. This check is safe to repeat per dictionary, so a section can be
* eliminated early without loading any further Parquet dictionaries.
*
* @param dictionaries dictionaries per column
* @param dictionary the single-column dictionary to test
*/
boolean matches(Map<ColumnDescriptor, DictionaryDescriptor> dictionaries);
boolean matches(DictionaryDescriptor dictionary);
}
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,7 @@ public static boolean predicateMatches(Predicate parquetPredicate, BlockMetaData
return false;
}

Map<ColumnDescriptor, DictionaryDescriptor> dictionaries = getDictionaries(block, dataSource, descriptorsByPath, parquetTupleDomain);
return parquetPredicate.matches(dictionaries);
return dictionaryPredicatesMatch(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain);
}

private static Map<ColumnDescriptor, Statistics<?>> getStatistics(BlockMetaData blockMetadata, Map<List<String>, RichColumnDescriptor> descriptorsByPath)
Expand All @@ -113,23 +112,22 @@ private static Map<ColumnDescriptor, Statistics<?>> getStatistics(BlockMetaData
return statistics.build();
}

private static Map<ColumnDescriptor, DictionaryDescriptor> getDictionaries(BlockMetaData blockMetadata, ParquetDataSource dataSource, Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<ColumnDescriptor> parquetTupleDomain)
private static boolean dictionaryPredicatesMatch(Predicate parquetPredicate, BlockMetaData blockMetadata, ParquetDataSource dataSource, Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<ColumnDescriptor> parquetTupleDomain)
{
ImmutableMap.Builder<ColumnDescriptor, DictionaryDescriptor> dictionaries = ImmutableMap.builder();
for (ColumnChunkMetaData columnMetaData : blockMetadata.getColumns()) {
RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(columnMetaData.getPath().toArray()));
if (descriptor != null) {
if (isOnlyDictionaryEncodingPages(columnMetaData) && isColumnPredicate(descriptor, parquetTupleDomain)) {
int totalSize = toIntExact(columnMetaData.getTotalSize());
byte[] buffer = new byte[totalSize];
byte[] buffer = new byte[toIntExact(columnMetaData.getTotalSize())];
dataSource.readFully(columnMetaData.getStartingPos(), buffer);
Optional<DictionaryPage> dictionaryPage = readDictionaryPage(buffer, columnMetaData.getCodec());
dictionaries.put(descriptor, new DictionaryDescriptor(descriptor, dictionaryPage));
break;
// Early abort: the predicate already eliminates this block, so no further dictionaries need to be read
if (!parquetPredicate.matches(new DictionaryDescriptor(descriptor, readDictionaryPage(buffer, columnMetaData.getCodec())))) {
return false;
}
}
}
}
return dictionaries.build();
return true;
}

private static Optional<DictionaryPage> readDictionaryPage(byte[] data, CompressionCodecName codecName)
Expand Down Expand Up @@ -157,7 +155,7 @@ private static Optional<DictionaryPage> readDictionaryPage(byte[] data, Compress
private static boolean isColumnPredicate(ColumnDescriptor columnDescriptor, TupleDomain<ColumnDescriptor> parquetTupleDomain)
{
verify(parquetTupleDomain.getDomains().isPresent(), "parquetTupleDomain is empty");
return parquetTupleDomain.getDomains().get().keySet().contains(columnDescriptor);
return parquetTupleDomain.getDomains().get().containsKey(columnDescriptor);
}

@VisibleForTesting
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,27 +106,24 @@ public boolean matches(long numberOfRows, Map<ColumnDescriptor, Statistics<?>> s
}

@Override
public boolean matches(Map<ColumnDescriptor, DictionaryDescriptor> dictionaries)
public boolean matches(DictionaryDescriptor dictionary)
{
requireNonNull(dictionary, "dictionary is null");
if (effectivePredicate.isNone()) {
return false;
}

Map<ColumnDescriptor, Domain> effectivePredicateDomains = effectivePredicate.getDomains()
.orElseThrow(() -> new IllegalStateException("Effective predicate other than none should have domains"));

for (RichColumnDescriptor column : columns) {
Domain effectivePredicateDomain = effectivePredicateDomains.get(column);
if (effectivePredicateDomain == null) {
continue;
}
DictionaryDescriptor dictionaryDescriptor = dictionaries.get(column);
Domain domain = getDomain(effectivePredicateDomain.getType(), dictionaryDescriptor);
if (effectivePredicateDomain.intersect(domain).isNone()) {
return false;
}
}
return true;
Domain effectivePredicateDomain = effectivePredicateDomains.get(dictionary.getColumnDescriptor());

return effectivePredicateDomain == null || effectivePredicateMatches(effectivePredicateDomain, dictionary);
}

/**
 * Returns whether the dictionary's value domain overlaps the effective predicate's
 * domain for this column, i.e. whether the predicate could still match any row.
 */
private static boolean effectivePredicateMatches(Domain effectivePredicateDomain, DictionaryDescriptor dictionary)
{
    // Derive the domain covered by the dictionary values, then test for a non-empty intersection
    Domain dictionaryDomain = getDomain(effectivePredicateDomain.getType(), dictionary);
    Domain intersection = effectivePredicateDomain.intersect(dictionaryDomain);
    return !intersection.isNone();
}

@VisibleForTesting
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ public void testVarcharMatchesWithDictionaryDescriptor()
TupleDomain<ColumnDescriptor> effectivePredicate = getEffectivePredicate(column, createVarcharType(255), EMPTY_SLICE);
TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column));
DictionaryPage page = new DictionaryPage(Slices.wrappedBuffer(new byte[] {0, 0, 0, 0}), 1, PLAIN_DICTIONARY);
assertTrue(parquetPredicate.matches(singletonMap(column, new DictionaryDescriptor(column, Optional.of(page)))));
assertTrue(parquetPredicate.matches(new DictionaryDescriptor(column, Optional.of(page))));
}

private TupleDomain<ColumnDescriptor> getEffectivePredicate(RichColumnDescriptor column, VarcharType type, Slice value)
Expand Down