Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,7 @@
import static io.trino.spi.StandardErrorCode.INVALID_TABLE_PROPERTY;
import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED;
import static io.trino.spi.StandardErrorCode.TABLE_NOT_FOUND;
import static io.trino.spi.connector.Constraint.alwaysTrue;
import static io.trino.spi.connector.RetryMode.NO_RETRIES;
import static io.trino.spi.predicate.TupleDomain.withColumnDomains;
import static io.trino.spi.statistics.TableStatisticType.ROW_COUNT;
Expand Down Expand Up @@ -485,7 +486,7 @@ public ConnectorTableHandle getTableHandleForStatisticsCollection(ConnectorSessi

handle = handle.withAnalyzePartitionValues(list);
HivePartitionResult partitions = partitionManager.getPartitions(handle, list);
handle = partitionManager.applyPartitionResult(handle, partitions, Optional.empty());
handle = partitionManager.applyPartitionResult(handle, partitions, alwaysTrue());
}

if (analyzeColumnNames.isPresent()) {
Expand Down Expand Up @@ -761,8 +762,14 @@ public TableStatistics getTableStatistics(ConnectorSession session, ConnectorTab
Map<String, Type> columnTypes = columns.entrySet().stream()
.collect(toImmutableMap(Map.Entry::getKey, entry -> getColumnMetadata(session, tableHandle, entry.getValue()).getType()));
HivePartitionResult partitionResult = partitionManager.getPartitions(metastore, tableHandle, constraint);
List<HivePartition> partitions = partitionManager.getPartitionsAsList(partitionResult);
return hiveStatisticsProvider.getTableStatistics(session, ((HiveTableHandle) tableHandle).getSchemaTableName(), columns, columnTypes, partitions);
// If partitions are not loaded, then don't generate table statistics.
// Note that the computation is not persisted in the table handle, so can be redone many times
// TODO: https://github.com/trinodb/trino/issues/10980.
Comment thread
Praveen2112 marked this conversation as resolved.
Outdated
if (partitionManager.canPartitionsBeLoaded(partitionResult)) {
List<HivePartition> partitions = partitionManager.getPartitionsAsList(partitionResult);
return hiveStatisticsProvider.getTableStatistics(session, ((HiveTableHandle) tableHandle).getSchemaTableName(), columns, columnTypes, partitions);
}
return TableStatistics.empty();
}

private List<SchemaTableName> listTables(ConnectorSession session, SchemaTablePrefix prefix)
Expand Down Expand Up @@ -2536,17 +2543,40 @@ public ConnectorTableProperties getTableProperties(ConnectorSession session, Con
HiveTableHandle hiveTable = (HiveTableHandle) table;

List<ColumnHandle> partitionColumns = ImmutableList.copyOf(hiveTable.getPartitionColumns());
List<HivePartition> partitions = partitionManager.getOrLoadPartitions(metastore, hiveTable);

TupleDomain<ColumnHandle> predicate = createPredicate(partitionColumns, partitions);

TupleDomain<ColumnHandle> predicate = TupleDomain.all();
Optional<DiscretePredicates> discretePredicates = Optional.empty();
if (!partitionColumns.isEmpty()) {
// Do not create tuple domains for every partition at the same time!
// There can be a huge number of partitions so use an iterable so
// all domains do not need to be in memory at the same time.
Iterable<TupleDomain<ColumnHandle>> partitionDomains = Iterables.transform(partitions, hivePartition -> TupleDomain.fromFixedValues(hivePartition.getKeys()));
discretePredicates = Optional.of(new DiscretePredicates(partitionColumns, partitionDomains));

// If only partition names are loaded, then the predicates are partially enforced.
// So computation of predicate and discretePredicates are not valid.
if (hiveTable.getPartitionNames().isEmpty()) {
Comment thread
Praveen2112 marked this conversation as resolved.
Outdated
Optional<List<HivePartition>> partitions = hiveTable.getPartitions()
// If the partitions are not loaded, try out if they can be loaded.
.or(() -> {
// We load the partitions to compute the predicates enforced by the table.
// Note that the computation is not persisted in the table handle, so can be redone many times
// TODO: https://github.com/trinodb/trino/issues/10980.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this change worth doing until we have that done? Seems like loading the partition information once eagerly is better than lazily doing it multiple times

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alexjo2144 I'm not sure what your suggestion is here — could you clarify?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm just asking if this change to lazily load the partition information is actually an improvement until that linked issue is completed.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or if that issue is a blocker for this to be merged.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We evaluate lazily so that the partitions are loaded only after all the filters have been pushed down to Hive.

HivePartitionResult partitionResult = partitionManager.getPartitions(metastore, table, new Constraint(hiveTable.getEnforcedConstraint()));
if (partitionManager.canPartitionsBeLoaded(partitionResult)) {
return Optional.of(partitionManager.getPartitionsAsList(partitionResult));
}
return Optional.empty();
});

if (partitions.isPresent()) {
Comment thread
findepi marked this conversation as resolved.
Outdated
List<HivePartition> hivePartitions = partitions.orElseThrow();
// Since the partitions are fully loaded now, we need to compute
predicate = createPredicate(partitionColumns, hivePartitions);

// Un-partitioned tables can have a partition with ID - UNPARTITIONED,
// this check allows us to ensure that table is partitioned
if (!partitionColumns.isEmpty()) {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can partitionColumns be empty here? The outer block of code looks like dealing with partitioned table

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In case of unpartitioned table too we might get a single entry for HivePartition, so we need this check.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a comment

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for adding a comment.
As a follow-up, please refactor this code: doing partition-related work across 15 lines only to eventually discover that the table isn't actually partitioned is backwards — we should reverse the order of the checks.

// Do not create tuple domains for every partition at the same time!
// There can be a huge number of partitions so use an iterable so
// all domains do not need to be in memory at the same time.
Iterable<TupleDomain<ColumnHandle>> partitionDomains = Iterables.transform(hivePartitions, hivePartition -> TupleDomain.fromFixedValues(hivePartition.getKeys()));
discretePredicates = Optional.of(new DiscretePredicates(partitionColumns, partitionDomains));
}
}
}

Optional<ConnectorTablePartitioning> tablePartitioning = Optional.empty();
Expand Down Expand Up @@ -2595,16 +2625,23 @@ public Optional<ConstraintApplicationResult<ConnectorTableHandle>> applyFilter(C
checkArgument(handle.getAnalyzePartitionValues().isEmpty() || constraint.getSummary().isAll(), "Analyze should not have a constraint");

HivePartitionResult partitionResult = partitionManager.getPartitions(metastore, handle, constraint);
HiveTableHandle newHandle = partitionManager.applyPartitionResult(handle, partitionResult, constraint.getPredicateColumns());
HiveTableHandle newHandle = partitionManager.applyPartitionResult(handle, partitionResult, constraint);

if (handle.getPartitions().equals(newHandle.getPartitions()) &&
Comment thread
findepi marked this conversation as resolved.
Outdated
handle.getPartitionNames().equals(newHandle.getPartitionNames()) &&
handle.getCompactEffectivePredicate().equals(newHandle.getCompactEffectivePredicate()) &&
handle.getBucketFilter().equals(newHandle.getBucketFilter()) &&
handle.getConstraintColumns().equals(newHandle.getConstraintColumns())) {
return Optional.empty();
}

return Optional.of(new ConstraintApplicationResult<>(newHandle, partitionResult.getUnenforcedConstraint(), false));
TupleDomain<ColumnHandle> unenforcedConstraint = partitionResult.getEffectivePredicate();
if (newHandle.getPartitions().isPresent()) {
List<HiveColumnHandle> partitionColumns = partitionResult.getPartitionColumns();
unenforcedConstraint = partitionResult.getEffectivePredicate().filter((column, domain) -> !partitionColumns.contains(column));
}

return Optional.of(new ConstraintApplicationResult<>(newHandle, unenforcedConstraint, false));
}

@Override
Expand Down Expand Up @@ -2838,6 +2875,7 @@ public ConnectorTableHandle makeCompatiblePartitioning(ConnectorSession session,
hiveTable.getTableParameters(),
hiveTable.getPartitionColumns(),
hiveTable.getDataColumns(),
hiveTable.getPartitionNames(),
hiveTable.getPartitions(),
hiveTable.getCompactEffectivePredicate(),
hiveTable.getEnforcedConstraint(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.Predicate;

import static com.google.common.base.Preconditions.checkArgument;
Expand All @@ -48,7 +47,6 @@
import static io.trino.plugin.hive.metastore.MetastoreUtil.toPartitionName;
import static io.trino.plugin.hive.util.HiveBucketing.getHiveBucketFilter;
import static io.trino.plugin.hive.util.HiveUtil.parsePartitionValue;
import static io.trino.spi.predicate.TupleDomain.none;
import static java.lang.String.format;
import static java.util.stream.Collectors.toList;

Expand Down Expand Up @@ -86,7 +84,7 @@ public HivePartitionResult getPartitions(SemiTransactionalHiveMetastore metastor
List<HiveColumnHandle> partitionColumns = hiveTableHandle.getPartitionColumns();

if (effectivePredicate.isNone()) {
return new HivePartitionResult(partitionColumns, ImmutableList.of(), none(), none(), none(), hiveBucketHandle, Optional.empty());
return new HivePartitionResult(partitionColumns, Optional.empty(), ImmutableList.of(), TupleDomain.none(), TupleDomain.none(), hiveBucketHandle, Optional.empty());
}

Optional<HiveBucketFilter> bucketFilter = getHiveBucketFilter(hiveTableHandle, effectivePredicate);
Expand All @@ -97,10 +95,10 @@ public HivePartitionResult getPartitions(SemiTransactionalHiveMetastore metastor
if (partitionColumns.isEmpty()) {
return new HivePartitionResult(
partitionColumns,
Optional.empty(),
ImmutableList.of(new HivePartition(tableName)),
compactEffectivePredicate,
effectivePredicate,
TupleDomain.all(),
compactEffectivePredicate,
hiveBucketHandle,
bucketFilter);
}
Expand All @@ -109,6 +107,7 @@ public HivePartitionResult getPartitions(SemiTransactionalHiveMetastore metastor
.map(HiveColumnHandle::getType)
.collect(toList());

Optional<List<String>> partitionNames = Optional.empty();
Iterable<HivePartition> partitionsIterable;
Predicate<Map<ColumnHandle, NullableValue>> predicate = constraint.predicate().orElse(value -> true);
if (hiveTableHandle.getPartitions().isPresent()) {
Expand All @@ -117,19 +116,18 @@ public HivePartitionResult getPartitions(SemiTransactionalHiveMetastore metastor
.collect(toImmutableList());
}
else {
List<String> partitionNames = getFilteredPartitionNames(metastore, tableName, partitionColumns, compactEffectivePredicate);
partitionsIterable = () -> partitionNames.stream()
List<String> partitionNamesList = hiveTableHandle.getPartitionNames()
.orElseGet(() -> getFilteredPartitionNames(metastore, tableName, partitionColumns, compactEffectivePredicate));
partitionsIterable = () -> partitionNamesList.stream()
// Apply extra filters which could not be done by getFilteredPartitionNames
.map(partitionName -> parseValuesAndFilterPartition(tableName, partitionName, partitionColumns, partitionTypes, effectivePredicate, predicate))
.filter(Optional::isPresent)
.map(Optional::get)
.iterator();
partitionNames = Optional.of(partitionNamesList);
}

// All partition key domains will be fully evaluated, so we don't need to include those
TupleDomain<ColumnHandle> remainingTupleDomain = effectivePredicate.filter((column, domain) -> !partitionColumns.contains(column));
TupleDomain<ColumnHandle> enforcedTupleDomain = effectivePredicate.filter((column, domain) -> partitionColumns.contains(column));
return new HivePartitionResult(partitionColumns, partitionsIterable, compactEffectivePredicate, remainingTupleDomain, enforcedTupleDomain, hiveBucketHandle, bucketFilter);
return new HivePartitionResult(partitionColumns, partitionNames, partitionsIterable, effectivePredicate, compactEffectivePredicate, hiveBucketHandle, bucketFilter);
}

public HivePartitionResult getPartitions(ConnectorTableHandle tableHandle, List<List<String>> partitionValuesList)
Expand All @@ -153,7 +151,7 @@ public HivePartitionResult getPartitions(ConnectorTableHandle tableHandle, List<
.map(partition -> partition.orElseThrow(() -> new VerifyException("partition must exist")))
.collect(toImmutableList());

return new HivePartitionResult(partitionColumns, partitionList, TupleDomain.all(), TupleDomain.all(), TupleDomain.all(), bucketHandle, Optional.empty());
return new HivePartitionResult(partitionColumns, Optional.empty(), partitionList, TupleDomain.all(), TupleDomain.all(), bucketHandle, Optional.empty());
}

public List<HivePartition> getPartitionsAsList(HivePartitionResult partitionResult)
Expand All @@ -175,22 +173,38 @@ public List<HivePartition> getPartitionsAsList(HivePartitionResult partitionResu
return partitionList.build();
}

public HiveTableHandle applyPartitionResult(HiveTableHandle handle, HivePartitionResult partitions, Optional<Set<ColumnHandle>> columns)
public HiveTableHandle applyPartitionResult(HiveTableHandle handle, HivePartitionResult partitions, Constraint constraint)
{
Optional<List<String>> partitionNames = partitions.getPartitionNames();
Comment thread
Praveen2112 marked this conversation as resolved.
Outdated
Optional<List<HivePartition>> partitionList = Optional.empty();
TupleDomain<ColumnHandle> enforcedConstraint = handle.getEnforcedConstraint();

// Partitions will be loaded if
// 1. Number of partitionNames is less than or equal to threshold value. Thereby generating additional filter criteria
// that can be applied on other join side (if the join is based on partition column),
// 2. If additional predicate is passed as a part of Constraint. (specified via loadPartition). This delays the partition checks
// until we have additional filtering based on Constraint
if (canPartitionsBeLoaded(partitions) || constraint.predicate().isPresent()) {
partitionNames = Optional.empty();
partitionList = Optional.of(getPartitionsAsList(partitions));
List<HiveColumnHandle> partitionColumns = partitions.getPartitionColumns();
enforcedConstraint = partitions.getEffectivePredicate().filter((column, domain) -> partitionColumns.contains(column));
}
return new HiveTableHandle(
handle.getSchemaName(),
handle.getTableName(),
handle.getTableParameters(),
ImmutableList.copyOf(partitions.getPartitionColumns()),
handle.getDataColumns(),
Optional.of(getPartitionsAsList(partitions)),
partitionNames,
partitionList,
partitions.getCompactEffectivePredicate(),
partitions.getEnforcedConstraint(),
enforcedConstraint,
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if we skip the if (canPartitionsBeLoaded(partitions) || constraint.predicate().isPresent()) block above (i.e. condition was false), then this variable contains partitions.getEffectivePredicate() which looks like not enforced and potentially containing Domains on non-partitioning columns (so something that won't be enforced)
this is here passed to HiveTableHandle#enforcedConstraint

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

then this variable contains partitions.getEffectivePredicate()

Actually it would contain TupleDomain#all - but now it will be initialized to HiveTableHandle#enforcedConstraint

partitions.getBucketHandle(),
partitions.getBucketFilter(),
handle.getAnalyzePartitionValues(),
handle.getAnalyzeColumnNames(),
Sets.union(handle.getConstraintColumns(), columns.orElseGet(ImmutableSet::of)),
Sets.union(handle.getConstraintColumns(), constraint.getPredicateColumns().orElseGet(ImmutableSet::of)),
handle.getProjectedColumns(),
handle.getTransaction(),
handle.isRecordScannedFiles(),
Expand All @@ -199,8 +213,21 @@ public HiveTableHandle applyPartitionResult(HiveTableHandle handle, HivePartitio

public List<HivePartition> getOrLoadPartitions(SemiTransactionalHiveMetastore metastore, HiveTableHandle table)
{
// In case of partitions not being loaded, their permissible values are specified in `HiveTableHandle#getCompactEffectivePredicate,
// so we do an intersection of getCompactEffectivePredicate and HiveTable's enforced constraint
TupleDomain<ColumnHandle> summary = table.getEnforcedConstraint().intersect(
table.getCompactEffectivePredicate()
.transformKeys(ColumnHandle.class::cast));
Comment thread
findepi marked this conversation as resolved.
Outdated
return table.getPartitions().orElseGet(() ->
getPartitionsAsList(getPartitions(metastore, table, new Constraint(table.getEnforcedConstraint()))));
getPartitionsAsList(getPartitions(metastore, table, new Constraint(summary))));
}

public boolean canPartitionsBeLoaded(HivePartitionResult partitionResult)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are some callers of getPartitions which don't then call this method before collecting the Partitions. getTableStatistics for example, should this be checked there?

{
if (partitionResult.getPartitionNames().isPresent()) {
return partitionResult.getPartitionNames().orElseThrow().size() <= maxPartitions;
}
return true;
}

private Optional<HivePartition> parseValuesAndFilterPartition(
Expand Down
Loading