Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ public void setup()
procedures.add(new TableDataRewriteDistributedProcedure("system", "distributed_fun",
distributedArguments,
(session, transactionContext, procedureHandle, fragments) -> null,
(transactionContext, procedureHandle, fragments) -> {},
TestProcedureRegistry.TestProcedureContext::new));
(session, transactionContext, procedureHandle, fragments) -> {},
ignored -> new TestProcedureRegistry.TestProcedureContext()));
procedureRegistry.addProcedures(new ConnectorId("test"), procedures);
queryPreparer = new BuiltInQueryPreparer(SQL_PARSER, procedureRegistry);
}
Expand Down
41 changes: 41 additions & 0 deletions presto-docs/src/main/sphinx/connector/iceberg.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1237,6 +1237,47 @@ Examples:

CALL iceberg.system.set_table_property('schema_name', 'table_name', 'commit.retry.num-retries', '10');

Rewrite Data Files
^^^^^^^^^^^^^^^^^^

Iceberg tracks all data files under different partition specs in a table. More data files require
more metadata to be stored in manifest files, and small data files can cause an unnecessary amount of metadata and
less efficient queries due to file open costs. Also, data files under different partition specs can
prevent metadata-level deletion or thorough predicate pushdown for Presto.

Use ``rewrite_data_files`` to rewrite the data files of a specified table so that they are
merged into fewer but larger files under the newest partition spec. If the table is partitioned, the data
files compaction can act separately on the selected partitions to improve read performance by reducing
metadata overhead and runtime file open cost.

The following arguments are available:

===================== ========== =============== =======================================================================
Argument Name required type Description
===================== ========== =============== =======================================================================
``schema`` ✔️ string Schema of the table to update.

``table_name`` ✔️ string Name of the table to update.

``filter``                       string          Predicate as a string used for filtering the files. Currently
                                                 only rewrite of whole partitions is supported, so the predicate
                                                 may reference partition columns only. The default value is ``true``.

``options`` map Options to be used for data files rewrite. (to be expanded)
===================== ========== =============== =======================================================================

Examples:

* Rewrite all the data files in table ``db.sample`` to the newest partition spec and combine small files to larger ones::

CALL iceberg.system.rewrite_data_files('db', 'sample');
CALL iceberg.system.rewrite_data_files(schema => 'db', table_name => 'sample');

* Rewrite the data files in partitions specified by a filter in table ``db.sample`` to the newest partition spec::

CALL iceberg.system.rewrite_data_files('db', 'sample', 'partition_key = 1');
CALL iceberg.system.rewrite_data_files(schema => 'db', table_name => 'sample', filter => 'partition_key = 1');

Presto C++ Support
^^^^^^^^^^^^^^^^^^

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import com.facebook.airlift.json.JsonCodec;
import com.facebook.airlift.log.Logger;
import com.facebook.presto.common.QualifiedObjectName;
import com.facebook.presto.common.RuntimeStats;
import com.facebook.presto.common.Subfield;
import com.facebook.presto.common.predicate.TupleDomain;
Expand All @@ -35,6 +36,8 @@
import com.facebook.presto.spi.ColumnHandle;
import com.facebook.presto.spi.ColumnMetadata;
import com.facebook.presto.spi.ConnectorDeleteTableHandle;
import com.facebook.presto.spi.ConnectorDistributedProcedureHandle;
import com.facebook.presto.spi.ConnectorId;
import com.facebook.presto.spi.ConnectorInsertTableHandle;
import com.facebook.presto.spi.ConnectorNewTableLayout;
import com.facebook.presto.spi.ConnectorOutputTableHandle;
Expand Down Expand Up @@ -62,6 +65,9 @@
import com.facebook.presto.spi.connector.ConnectorTableVersion.VersionType;
import com.facebook.presto.spi.function.StandardFunctionResolution;
import com.facebook.presto.spi.plan.FilterStatsCalculatorService;
import com.facebook.presto.spi.procedure.BaseProcedure;
import com.facebook.presto.spi.procedure.DistributedProcedure;
import com.facebook.presto.spi.procedure.ProcedureRegistry;
import com.facebook.presto.spi.relation.RowExpression;
import com.facebook.presto.spi.relation.RowExpressionService;
import com.facebook.presto.spi.security.ViewSecurity;
Expand Down Expand Up @@ -249,12 +255,14 @@ public abstract class IcebergAbstractMetadata
protected static final int CURRENT_MATERIALIZED_VIEW_FORMAT_VERSION = 1;

protected final TypeManager typeManager;
protected final ProcedureRegistry procedureRegistry;
protected final JsonCodec<CommitTaskData> commitTaskCodec;
protected final JsonCodec<List<ColumnMapping>> columnMappingsCodec;
protected final JsonCodec<List<SchemaTableName>> schemaTableNamesCodec;
protected final NodeVersion nodeVersion;
protected final RowExpressionService rowExpressionService;
protected final FilterStatsCalculatorService filterStatsCalculatorService;
protected Optional<IcebergProcedureContext> procedureContext = Optional.empty();
protected Transaction transaction;
protected final StatisticsFileCache statisticsFileCache;
protected final IcebergTableProperties tableProperties;
Expand All @@ -264,6 +272,7 @@ public abstract class IcebergAbstractMetadata

public IcebergAbstractMetadata(
TypeManager typeManager,
ProcedureRegistry procedureRegistry,
StandardFunctionResolution functionResolution,
RowExpressionService rowExpressionService,
JsonCodec<CommitTaskData> commitTaskCodec,
Expand All @@ -275,6 +284,7 @@ public IcebergAbstractMetadata(
IcebergTableProperties tableProperties)
{
this.typeManager = requireNonNull(typeManager, "typeManager is null");
this.procedureRegistry = requireNonNull(procedureRegistry, "procedureRegistry is null");
this.commitTaskCodec = requireNonNull(commitTaskCodec, "commitTaskCodec is null");
this.columnMappingsCodec = requireNonNull(columnMappingsCodec, "columnMappingsCodec is null");
this.schemaTableNamesCodec = requireNonNull(schemaTableNamesCodec, "schemaTableNamesCodec is null");
Expand Down Expand Up @@ -317,6 +327,11 @@ protected abstract void updateIcebergViewProperties(

public abstract void unregisterTable(ConnectorSession clientSession, SchemaTableName schemaTableName);

/**
 * Returns the context of the distributed procedure call currently in progress,
 * or {@link Optional#empty()} when no distributed procedure is active.
 * The context is populated by {@code beginCallDistributedProcedure} and cleared
 * again when {@code finishCallDistributedProcedure} completes.
 */
public Optional<IcebergProcedureContext> getProcedureContext()
{
    return procedureContext;
}

/**
* This class implements the default implementation for getTableLayoutForConstraint which will be used in the case of a Java Worker
*/
Expand All @@ -327,15 +342,17 @@ public ConnectorTableLayoutResult getTableLayoutForConstraint(
Constraint<ColumnHandle> constraint,
Optional<Set<ColumnHandle>> desiredColumns)
{
Map<String, IcebergColumnHandle> predicateColumns = constraint.getSummary().getDomains().get().keySet().stream()
.map(IcebergColumnHandle.class::cast)
.collect(toImmutableMap(IcebergColumnHandle::getName, Functions.identity()));
Map<String, IcebergColumnHandle> predicateColumns = constraint.getSummary().getDomains()
.map(domains -> domains.keySet().stream()
.map(IcebergColumnHandle.class::cast)
.collect(toImmutableMap(IcebergColumnHandle::getName, Functions.identity())))
.orElse(ImmutableMap.of());

IcebergTableHandle handle = (IcebergTableHandle) table;
Table icebergTable = getIcebergTable(session, handle.getSchemaTableName());

List<IcebergColumnHandle> partitionColumns = getPartitionKeyColumnHandles(handle, icebergTable, typeManager);
TupleDomain<ColumnHandle> partitionColumnPredicate = TupleDomain.withColumnDomains(Maps.filterKeys(constraint.getSummary().getDomains().get(), Predicates.in(partitionColumns)));
TupleDomain<ColumnHandle> partitionColumnPredicate = TupleDomain.withColumnDomains(Maps.filterKeys(constraint.getSummary().getDomains().orElse(ImmutableMap.of()), Predicates.in(partitionColumns)));
Optional<Set<IcebergColumnHandle>> requestedColumns = desiredColumns.map(columns -> columns.stream().map(column -> (IcebergColumnHandle) column).collect(toImmutableSet()));

List<HivePartition> partitions;
Expand Down Expand Up @@ -1117,6 +1134,46 @@ public void truncateTable(ConnectorSession session, ConnectorTableHandle tableHa
removeScanFiles(icebergTable, TupleDomain.all());
}

/**
 * Begins execution of a distributed procedure on the table described by
 * {@code tableLayoutHandle}.
 * <p>
 * Resolves the procedure from the registry by catalog and schema-qualified name,
 * opens a new Iceberg transaction for the table, and creates the procedure context
 * that is held in {@link #procedureContext} until
 * {@code finishCallDistributedProcedure} commits and clears it.
 *
 * @param session the connector session
 * @param procedureName fully qualified name used to look up the procedure
 * @param tableLayoutHandle layout of the target table; must be an {@code IcebergTableLayoutHandle}
 * @param arguments positional procedure arguments
 * @return the handle describing the distributed execution, produced by the procedure's begin step
 * @throws PrestoException with {@code NOT_SUPPORTED} if the table handle pins a specific snapshot
 */
@Override
public ConnectorDistributedProcedureHandle beginCallDistributedProcedure(
        ConnectorSession session,
        QualifiedObjectName procedureName,
        ConnectorTableLayoutHandle tableLayoutHandle,
        Object[] arguments)
{
    IcebergTableHandle handle = ((IcebergTableLayoutHandle) tableLayoutHandle).getTable();
    Table icebergTable = getIcebergTable(session, handle.getSchemaTableName());

    if (handle.isSnapshotSpecified()) {
        // Fix: corrected grammar of the user-facing message ("do not allow" -> "does not allow")
        throw new PrestoException(NOT_SUPPORTED, "This connector does not allow table execute at a specified snapshot");
    }

    // The transaction stays open across the distributed execution and is committed
    // in finishCallDistributedProcedure.
    transaction = icebergTable.newTransaction();
    BaseProcedure<?> procedure = procedureRegistry.resolve(
            new ConnectorId(procedureName.getCatalogName()),
            new SchemaTableName(
                    procedureName.getSchemaName(),
                    procedureName.getObjectName()));
    verify(procedure instanceof DistributedProcedure, "procedure must be DistributedProcedure");
    procedureContext = Optional.of((IcebergProcedureContext) ((DistributedProcedure) procedure).createContext(icebergTable, transaction));
    return ((DistributedProcedure) procedure).begin(session, procedureContext.get(), tableLayoutHandle, arguments);
}

/**
 * Completes a distributed procedure call started by {@code beginCallDistributedProcedure}:
 * invokes the procedure's finish step with the collected fragments, commits the open
 * Iceberg transaction, and resets the stored procedure context.
 */
@Override
public void finishCallDistributedProcedure(ConnectorSession session, ConnectorDistributedProcedureHandle procedureHandle, QualifiedObjectName procedureName, Collection<Slice> fragments)
{
    SchemaTableName procedureSchemaTableName = new SchemaTableName(
            procedureName.getSchemaName(),
            procedureName.getObjectName());
    BaseProcedure<?> resolvedProcedure = procedureRegistry.resolve(
            new ConnectorId(procedureName.getCatalogName()),
            procedureSchemaTableName);
    verify(resolvedProcedure instanceof DistributedProcedure, "procedure must be DistributedProcedure");
    verify(procedureContext.isPresent(), "procedure context must be present");

    DistributedProcedure distributedProcedure = (DistributedProcedure) resolvedProcedure;
    distributedProcedure.finish(session, procedureContext.get(), procedureHandle, fragments);
    // Commit the transaction opened in beginCallDistributedProcedure, then clear the context.
    transaction.commitTransaction();
    procedureContext = Optional.empty();
}

@Override
public ConnectorDeleteTableHandle beginDelete(ConnectorSession session, ConnectorTableHandle tableHandle)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
import com.facebook.presto.iceberg.procedure.ManifestFileCacheInvalidationProcedure;
import com.facebook.presto.iceberg.procedure.RegisterTableProcedure;
import com.facebook.presto.iceberg.procedure.RemoveOrphanFiles;
import com.facebook.presto.iceberg.procedure.RewriteDataFilesProcedure;
import com.facebook.presto.iceberg.procedure.RollbackToSnapshotProcedure;
import com.facebook.presto.iceberg.procedure.RollbackToTimestampProcedure;
import com.facebook.presto.iceberg.procedure.SetCurrentSnapshotProcedure;
Expand Down Expand Up @@ -190,6 +191,7 @@ protected void setup(Binder binder)
procedures.addBinding().toProvider(SetTablePropertyProcedure.class).in(Scopes.SINGLETON);
procedures.addBinding().toProvider(StatisticsFileCacheInvalidationProcedure.class).in(Scopes.SINGLETON);
procedures.addBinding().toProvider(ManifestFileCacheInvalidationProcedure.class).in(Scopes.SINGLETON);
procedures.addBinding().toProvider(RewriteDataFilesProcedure.class).in(Scopes.SINGLETON);

// for orc
binder.bind(EncryptionLibrary.class).annotatedWith(HiveDwrfEncryptionProvider.ForCryptoService.class).to(UnsupportedEncryptionLibrary.class).in(Scopes.SINGLETON);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.iceberg;

import com.facebook.presto.hive.HiveCompressionCodec;
import com.facebook.presto.spi.ConnectorDistributedProcedureHandle;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.collect.ImmutableList;

import java.util.List;
import java.util.Map;

import static java.util.Objects.requireNonNull;

/**
 * Handle describing a distributed procedure execution on an Iceberg table.
 * <p>
 * Extends {@link IcebergWritableTableHandle} with the layout of the table being
 * processed and an opaque map of procedure-specific data, so the handle can be
 * serialized to workers. Jackson property names form the wire contract and must
 * not change.
 */
public class IcebergDistributedProcedureHandle
        extends IcebergWritableTableHandle
        implements ConnectorDistributedProcedureHandle
{
    // Layout of the table targeted by the procedure; shipped to workers with the handle.
    private final IcebergTableLayoutHandle tableLayoutHandle;
    // Procedure-specific key/value data carried alongside the write handle.
    private final Map<String, String> relevantData;

    @JsonCreator
    public IcebergDistributedProcedureHandle(
            @JsonProperty("schemaName") String schemaName,
            @JsonProperty("tableName") IcebergTableName tableName,
            @JsonProperty("schema") PrestoIcebergSchema schema,
            @JsonProperty("partitionSpec") PrestoIcebergPartitionSpec partitionSpec,
            @JsonProperty("inputColumns") List<IcebergColumnHandle> inputColumns,
            @JsonProperty("outputPath") String outputPath,
            @JsonProperty("fileFormat") FileFormat fileFormat,
            @JsonProperty("compressionCodec") HiveCompressionCodec compressionCodec,
            @JsonProperty("storageProperties") Map<String, String> storageProperties,
            @JsonProperty("tableLayoutHandle") IcebergTableLayoutHandle tableLayoutHandle,
            @JsonProperty("relevantData") Map<String, String> relevantData)
    {
        // Distributed procedures carry no sort order, hence the empty list.
        super(
                schemaName,
                tableName,
                schema,
                partitionSpec,
                inputColumns,
                outputPath,
                fileFormat,
                compressionCodec,
                storageProperties,
                ImmutableList.of());
        this.tableLayoutHandle = requireNonNull(tableLayoutHandle, "tableLayoutHandle is null");
        this.relevantData = requireNonNull(relevantData, "relevantData is null");
    }

    @JsonProperty
    public IcebergTableLayoutHandle getTableLayoutHandle()
    {
        return tableLayoutHandle;
    }

    @JsonProperty
    public Map<String, String> getRelevantData()
    {
        return relevantData;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import com.facebook.presto.hive.HiveTransactionHandle;
import com.facebook.presto.spi.ColumnHandle;
import com.facebook.presto.spi.ConnectorDeleteTableHandle;
import com.facebook.presto.spi.ConnectorDistributedProcedureHandle;
import com.facebook.presto.spi.ConnectorHandleResolver;
import com.facebook.presto.spi.ConnectorInsertTableHandle;
import com.facebook.presto.spi.ConnectorOutputTableHandle;
Expand Down Expand Up @@ -69,6 +70,12 @@ public Class<? extends ConnectorDeleteTableHandle> getDeleteTableHandleClass()
return IcebergTableHandle.class;
}

// Resolves distributed-procedure handles for this connector to the Iceberg-specific
// implementation so the engine can deserialize them on workers.
@Override
public Class<? extends ConnectorDistributedProcedureHandle> getDistributedProcedureHandleClass()
{
return IcebergDistributedProcedureHandle.class;
}

@Override
public Class<? extends ConnectorTransactionHandle> getTransactionHandleClass()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
import com.facebook.presto.spi.ViewNotFoundException;
import com.facebook.presto.spi.function.StandardFunctionResolution;
import com.facebook.presto.spi.plan.FilterStatsCalculatorService;
import com.facebook.presto.spi.procedure.ProcedureRegistry;
import com.facebook.presto.spi.relation.RowExpressionService;
import com.facebook.presto.spi.security.PrestoPrincipal;
import com.facebook.presto.spi.statistics.ColumnStatisticMetadata;
Expand Down Expand Up @@ -177,6 +178,7 @@ public IcebergHiveMetadata(
ExtendedHiveMetastore metastore,
HdfsEnvironment hdfsEnvironment,
TypeManager typeManager,
ProcedureRegistry procedureRegistry,
StandardFunctionResolution functionResolution,
RowExpressionService rowExpressionService,
JsonCodec<CommitTaskData> commitTaskCodec,
Expand All @@ -190,7 +192,8 @@ public IcebergHiveMetadata(
IcebergTableProperties tableProperties,
ConnectorSystemConfig connectorSystemConfig)
{
super(typeManager, functionResolution, rowExpressionService, commitTaskCodec, columnMappingsCodec, schemaTableNamesCodec, nodeVersion, filterStatsCalculatorService, statisticsFileCache, tableProperties);
super(typeManager, procedureRegistry, functionResolution, rowExpressionService, commitTaskCodec, columnMappingsCodec, schemaTableNamesCodec,
nodeVersion, filterStatsCalculatorService, statisticsFileCache, tableProperties);
this.catalogName = requireNonNull(catalogName, "catalogName is null");
this.metastore = requireNonNull(metastore, "metastore is null");
this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import com.facebook.presto.spi.connector.ConnectorMetadata;
import com.facebook.presto.spi.function.StandardFunctionResolution;
import com.facebook.presto.spi.plan.FilterStatsCalculatorService;
import com.facebook.presto.spi.procedure.ProcedureRegistry;
import com.facebook.presto.spi.relation.RowExpressionService;
import jakarta.inject.Inject;

Expand All @@ -39,6 +40,7 @@ public class IcebergHiveMetadataFactory
final ExtendedHiveMetastore metastore;
final HdfsEnvironment hdfsEnvironment;
final TypeManager typeManager;
final ProcedureRegistry procedureRegistry;
final JsonCodec<CommitTaskData> commitTaskCodec;
final JsonCodec<List<ColumnMapping>> columnMappingsCodec;
final JsonCodec<List<SchemaTableName>> schemaTableNamesCodec;
Expand All @@ -58,6 +60,7 @@ public IcebergHiveMetadataFactory(
ExtendedHiveMetastore metastore,
HdfsEnvironment hdfsEnvironment,
TypeManager typeManager,
ProcedureRegistry procedureRegistry,
StandardFunctionResolution functionResolution,
RowExpressionService rowExpressionService,
JsonCodec<CommitTaskData> commitTaskCodec,
Expand All @@ -75,6 +78,7 @@ public IcebergHiveMetadataFactory(
this.metastore = requireNonNull(metastore, "metastore is null");
this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null");
this.typeManager = requireNonNull(typeManager, "typeManager is null");
this.procedureRegistry = requireNonNull(procedureRegistry, "procedureRegistry is null");
this.functionResolution = requireNonNull(functionResolution, "functionResolution is null");
this.rowExpressionService = requireNonNull(rowExpressionService, "rowExpressionService is null");
this.commitTaskCodec = requireNonNull(commitTaskCodec, "commitTaskCodec is null");
Expand All @@ -96,6 +100,7 @@ public ConnectorMetadata create()
metastore,
hdfsEnvironment,
typeManager,
procedureRegistry,
functionResolution,
rowExpressionService,
commitTaskCodec,
Expand Down
Loading
Loading