diff --git a/core/trino-server/src/main/provisio/trino.xml b/core/trino-server/src/main/provisio/trino.xml index 9de46c96cf1d..cbab9902568f 100644 --- a/core/trino-server/src/main/provisio/trino.xml +++ b/core/trino-server/src/main/provisio/trino.xml @@ -74,6 +74,12 @@ + + + + + + diff --git a/lib/trino-hdfs/src/main/java/io/trino/hdfs/TrinoFileSystemCache.java b/lib/trino-hdfs/src/main/java/io/trino/hdfs/TrinoFileSystemCache.java index 7a493236ad2c..b36936b117da 100644 --- a/lib/trino-hdfs/src/main/java/io/trino/hdfs/TrinoFileSystemCache.java +++ b/lib/trino-hdfs/src/main/java/io/trino/hdfs/TrinoFileSystemCache.java @@ -350,6 +350,12 @@ public FSDataInputStream open(Path f, int bufferSize) return new InputStreamWrapper(getRawFileSystem().open(f, bufferSize), this); } + @Override + public String getScheme() + { + return getRawFileSystem().getScheme(); + } + @Override public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveMetadata.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveMetadata.java index 20b323b67f08..b96310e24f6b 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveMetadata.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveMetadata.java @@ -265,7 +265,7 @@ import static io.trino.plugin.hive.util.CompressionConfigUtil.configureCompression; import static io.trino.plugin.hive.util.HiveBucketing.getHiveBucketHandle; import static io.trino.plugin.hive.util.HiveBucketing.isSupportedBucketing; -import static io.trino.plugin.hive.util.HiveUtil.columnExtraInfo; +import static io.trino.plugin.hive.util.HiveUtil.columnMetadataGetter; import static io.trino.plugin.hive.util.HiveUtil.getPartitionKeyColumnHandles; import static io.trino.plugin.hive.util.HiveUtil.getRegularColumnHandles; import static io.trino.plugin.hive.util.HiveUtil.hiveColumnHandles; @@ -3536,40 +3536,6 @@ else if (type instanceof 
RowType) { } } - private Function columnMetadataGetter(Table table) - { - ImmutableList.Builder columnNames = ImmutableList.builder(); - table.getPartitionColumns().stream().map(Column::getName).forEach(columnNames::add); - table.getDataColumns().stream().map(Column::getName).forEach(columnNames::add); - List allColumnNames = columnNames.build(); - if (allColumnNames.size() > Sets.newHashSet(allColumnNames).size()) { - throw new TrinoException(HIVE_INVALID_METADATA, - format("Hive metadata for table %s is invalid: Table descriptor contains duplicate columns", table.getTableName())); - } - - List tableColumns = table.getDataColumns(); - ImmutableMap.Builder> builder = ImmutableMap.builder(); - for (Column field : concat(tableColumns, table.getPartitionColumns())) { - if (field.getComment().isPresent() && !field.getComment().get().equals("from deserializer")) { - builder.put(field.getName(), field.getComment()); - } - else { - builder.put(field.getName(), Optional.empty()); - } - } - - Map> columnComment = builder.buildOrThrow(); - - return handle -> ColumnMetadata.builder() - .setName(handle.getName()) - .setType(handle.getType()) - .setComment(handle.isHidden() ? 
Optional.empty() : columnComment.get(handle.getName())) - .setExtraInfo(Optional.ofNullable(columnExtraInfo(handle.isPartitionKey()))) - .setHidden(handle.isHidden()) - .setProperties(partitionProjectionService.getPartitionProjectionTrinoColumnProperties(table, handle.getName())) - .build(); - } - @Override public void rollback() { diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java index 7eb069ac9f6b..2f804be82a9b 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java @@ -211,17 +211,7 @@ public static ReaderPageSource createPageSource( FileMetaData fileMetaData = parquetMetadata.getFileMetaData(); fileSchema = fileMetaData.getSchema(); - Optional message = projectSufficientColumns(columns) - .map(projection -> projection.get().stream() - .map(HiveColumnHandle.class::cast) - .collect(toUnmodifiableList())) - .orElse(columns).stream() - .filter(column -> column.getColumnType() == REGULAR) - .map(column -> getColumnType(column, fileSchema, useColumnNames)) - .filter(Optional::isPresent) - .map(Optional::get) - .map(type -> new MessageType(fileSchema.getName(), type)) - .reduce(MessageType::union); + Optional message = getParquetMessageType(columns, useColumnNames, fileSchema); requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of())); messageColumn = getColumnIO(fileSchema, requestedSchema); @@ -301,6 +291,22 @@ && predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parq } } + public static Optional getParquetMessageType(List columns, boolean useColumnNames, MessageType fileSchema) + { + Optional message = projectSufficientColumns(columns) + .map(projection -> projection.get().stream() + .map(HiveColumnHandle.class::cast) + 
.collect(toUnmodifiableList())) + .orElse(columns).stream() + .filter(column -> column.getColumnType() == REGULAR) + .map(column -> getColumnType(column, fileSchema, useColumnNames)) + .filter(Optional::isPresent) + .map(Optional::get) + .map(type -> new MessageType(fileSchema.getName(), type)) + .reduce(MessageType::union); + return message; + } + public static Optional getParquetType(GroupType groupType, boolean useParquetColumnNames, HiveColumnHandle column) { if (useParquetColumnNames) { @@ -341,7 +347,7 @@ public static Optional getColumnType(HiveColumnH return Optional.of(new GroupType(baseType.getRepetition(), baseType.getName(), ImmutableList.of(type))); } - private static Optional getColumnIndexStore( + public static Optional getColumnIndexStore( ParquetDataSource dataSource, BlockMetaData blockMetadata, Map, ColumnDescriptor> descriptorsByPath, @@ -416,7 +422,7 @@ public static TupleDomain getParquetTupleDomain( return TupleDomain.withColumnDomains(predicate.buildOrThrow()); } - private static org.apache.parquet.schema.Type getParquetType(HiveColumnHandle column, MessageType messageType, boolean useParquetColumnNames) + public static org.apache.parquet.schema.Type getParquetType(HiveColumnHandle column, MessageType messageType, boolean useParquetColumnNames) { if (useParquetColumnNames) { return getParquetTypeByName(column.getBaseColumnName(), messageType); @@ -428,7 +434,7 @@ private static org.apache.parquet.schema.Type getParquetType(HiveColumnHandle co return null; } - private static List createParquetReaderColumns(List baseColumns, MessageType fileSchema, MessageColumnIO messageColumn, boolean useColumnNames) + public static List createParquetReaderColumns(List baseColumns, MessageType fileSchema, MessageColumnIO messageColumn, boolean useColumnNames) { for (HiveColumnHandle column : baseColumns) { checkArgument(column == PARQUET_ROW_INDEX_COLUMN || column.getColumnType() == REGULAR, "column type must be REGULAR: %s", column); diff --git 
a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3/TrinoS3FileSystem.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3/TrinoS3FileSystem.java index ac5f232d297e..bcd5625fcc65 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3/TrinoS3FileSystem.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3/TrinoS3FileSystem.java @@ -361,6 +361,12 @@ private void closeSuper() super.close(); } + @Override + public String getScheme() + { + return uri.getScheme(); + } + @Override public URI getUri() { diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java index 38f11ad8f515..f43632babc68 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java @@ -17,7 +17,9 @@ import com.google.common.base.Splitter; import com.google.common.base.VerifyException; import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; import io.airlift.compress.lzo.LzoCodec; import io.airlift.compress.lzo.LzopCodec; import io.airlift.slice.Slice; @@ -36,6 +38,7 @@ import io.trino.plugin.hive.metastore.Table; import io.trino.spi.ErrorCodeSupplier; import io.trino.spi.TrinoException; +import io.trino.spi.connector.ColumnMetadata; import io.trino.spi.predicate.NullableValue; import io.trino.spi.type.ArrayType; import io.trino.spi.type.CharType; @@ -96,12 +99,14 @@ import java.util.Optional; import java.util.OptionalInt; import java.util.Properties; +import java.util.function.Function; import java.util.regex.Matcher; import java.util.regex.Pattern; import static com.google.common.base.MoreObjects.firstNonNull; import static com.google.common.base.Preconditions.checkArgument; import static 
com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.Iterables.concat; import static com.google.common.collect.Lists.newArrayList; import static io.airlift.slice.Slices.utf8Slice; import static io.trino.hdfs.ConfigurationUtils.copy; @@ -1155,4 +1160,37 @@ public static boolean isSparkBucketedTable(Table table) return table.getParameters().containsKey(SPARK_TABLE_PROVIDER_KEY) && table.getParameters().containsKey(SPARK_TABLE_BUCKET_NUMBER_KEY); } + + public static Function columnMetadataGetter(Table table) + { + ImmutableList.Builder columnNames = ImmutableList.builder(); + table.getPartitionColumns().stream().map(Column::getName).forEach(columnNames::add); + table.getDataColumns().stream().map(Column::getName).forEach(columnNames::add); + List allColumnNames = columnNames.build(); + if (allColumnNames.size() > Sets.newHashSet(allColumnNames).size()) { + throw new TrinoException(HIVE_INVALID_METADATA, + format("Hive metadata for table %s is invalid: Table descriptor contains duplicate columns", table.getTableName())); + } + + List tableColumns = table.getDataColumns(); + ImmutableMap.Builder> builder = ImmutableMap.builder(); + for (Column field : concat(tableColumns, table.getPartitionColumns())) { + if (field.getComment().isPresent() && !field.getComment().get().equals("from deserializer")) { + builder.put(field.getName(), field.getComment()); + } + else { + builder.put(field.getName(), Optional.empty()); + } + } + + Map> columnComment = builder.buildOrThrow(); + + return handle -> ColumnMetadata.builder() + .setName(handle.getName()) + .setType(handle.getType()) + .setComment(handle.isHidden() ? 
Optional.empty() : columnComment.get(handle.getName())) + .setExtraInfo(Optional.ofNullable(columnExtraInfo(handle.isPartitionKey()))) + .setHidden(handle.isHidden()) + .build(); + } } diff --git a/plugin/trino-hudi/pom.xml b/plugin/trino-hudi/pom.xml new file mode 100644 index 000000000000..c3086fd5a57e --- /dev/null +++ b/plugin/trino-hudi/pom.xml @@ -0,0 +1,474 @@ + + + 4.0.0 + + + trino-root + io.trino + 398-SNAPSHOT + ../../pom.xml + + + trino-hudi + Trino - Hudi Connector + trino-plugin + + + ${project.parent.basedir} + 0.11.1 + + + + + io.trino + trino-filesystem + + + + io.trino + trino-hdfs + + + + io.trino + trino-hive + + + + io.trino + trino-memory-context + + + + io.trino + trino-parquet + + + + io.trino + trino-plugin-toolkit + + + + io.trino.hadoop + hadoop-apache + + + + io.trino.hive + hive-apache + + + + io.airlift + bootstrap + + + + io.airlift + concurrent + + + + io.airlift + configuration + + + + io.airlift + event + + + + io.airlift + json + + + + io.airlift + log + + + + io.airlift + units + + + + com.google.code.findbugs + jsr305 + true + + + + com.google.guava + guava + + + + com.google.inject + guice + + + + javax.annotation + javax.annotation-api + + + + javax.inject + javax.inject + + + + javax.validation + validation-api + + + + joda-time + joda-time + + + + org.apache.hudi + hudi-common + ${dep.hudi.version} + + + org.apache.hbase + hbase-server + + + org.apache.hbase + hbase-client + + + org.apache.orc + orc-core + + + com.fasterxml.jackson.core + jackson-annotations + + + com.fasterxml.jackson.core + jackson-databind + + + org.apache.httpcomponents + httpclient + + + org.apache.httpcomponents + fluent-hc + + + org.rocksdb + rocksdbjni + + + com.esotericsoftware + kryo-shaded + + + org.apache.hadoop + hadoop-client + + + org.apache.hadoop + hadoop-hdfs + + + org.apache.httpcomponents + httpcore + + + org.apache.hive + hive-exec + + + org.apache.hive + hive-jdbc + + + com.github.ben-manes.caffeine + caffeine + + + org.lz4 + lz4-java + 
+ + + + + org.apache.hudi + hudi-hadoop-mr + ${dep.hudi.version} + + + * + * + + + + + + org.weakref + jmxutils + + + + + io.trino + trino-hadoop-toolkit + runtime + + + + io.airlift + log-manager + runtime + + + + + io.trino + trino-spi + provided + + + + io.airlift + slice + provided + + + + com.fasterxml.jackson.core + jackson-annotations + provided + + + + org.openjdk.jol + jol-core + provided + + + + + io.trino + trino-hive + test-jar + test + + + + io.trino + trino-hive-hadoop2 + test + + + + io.trino + trino-main + test + + + + io.trino + trino-main + test-jar + test + + + + io.trino + trino-parser + test + + + + io.trino + trino-spi + test-jar + test + + + + io.trino + trino-testing + test + + + + io.trino + trino-testing-containers + test + + + + io.trino + trino-testing-services + test + + + + io.trino + trino-tpch + test + + + + io.trino.tpch + tpch + test + + + + io.airlift + testing + test + + + + org.apache.hudi + hudi-client-common + ${dep.hudi.version} + test + + + com.beust + jcommander + + + commons-logging + commons-logging + + + log4j + log4j + + + io.dropwizard.metrics + metrics-core + + + org.apache.curator + curator-framework + + + org.apache.hudi + hudi-common + + + org.apache.hudi + hudi-hive-sync + + + org.apache.hudi + hudi-timeline-service + + + org.apache.hive + hive-service + + + org.apache.parquet + parquet-avro + + + org.apache.curator + curator-client + + + org.apache.curator + curator-recipes + + + com.github.davidmoten + hilbert-curve + + + io.prometheus + * + + + io.dropwizard.metrics + * + + + + + + org.apache.hudi + hudi-java-client + ${dep.hudi.version} + test + + + org.apache.hudi + * + + + org.apache.parquet + parquet-avro + + + + + + org.assertj + assertj-core + test + + + + org.jetbrains + annotations + test + + + + org.testng + testng + test + + + + + + + org.basepom.maven + duplicate-finder-maven-plugin + + + + mime.types + about.html + + log4j.properties + log4j-surefire.properties + + + + + + org.apache.maven.plugins + 
maven-surefire-plugin + + + --add-opens=java.base/java.lang=ALL-UNNAMED + --add-opens=java.base/java.util=ALL-UNNAMED + --add-opens=java.base/java.util.concurrent=ALL-UNNAMED + + + + + + + diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/ForHudiSplitManager.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/ForHudiSplitManager.java new file mode 100644 index 000000000000..648f7f31fc7c --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/ForHudiSplitManager.java @@ -0,0 +1,29 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi; + +import javax.inject.Qualifier; + +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import static java.lang.annotation.ElementType.FIELD; +import static java.lang.annotation.ElementType.METHOD; +import static java.lang.annotation.ElementType.PARAMETER; +import static java.lang.annotation.RetentionPolicy.RUNTIME; + +@Retention(RUNTIME) +@Target({FIELD, PARAMETER, METHOD}) +@Qualifier +public @interface ForHudiSplitManager {} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiConfig.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiConfig.java new file mode 100644 index 000000000000..ea323818bdf6 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiConfig.java @@ -0,0 +1,194 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi; + +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableList; +import io.airlift.configuration.Config; +import io.airlift.configuration.ConfigDescription; +import io.airlift.units.DataSize; + +import javax.validation.constraints.DecimalMax; +import javax.validation.constraints.DecimalMin; +import javax.validation.constraints.Max; +import javax.validation.constraints.Min; +import javax.validation.constraints.NotNull; + +import java.util.List; + +import static com.google.common.base.Strings.nullToEmpty; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.airlift.units.DataSize.Unit.MEGABYTE; +import static java.util.Locale.ENGLISH; + +public class HudiConfig +{ + private static final Splitter COMMA_SPLITTER = Splitter.on(",").omitEmptyStrings().trimResults(); + + private List columnsToHide = ImmutableList.of(); + private boolean metadataEnabled; + private boolean shouldUseParquetColumnNames = true; + private int minPartitionBatchSize = 10; + private int maxPartitionBatchSize = 100; + private boolean sizeBasedSplitWeightsEnabled = true; + private DataSize standardSplitWeightSize = DataSize.of(128, MEGABYTE); + private double minimumAssignedSplitWeight = 0.05; + private int maxSplitsPerSecond = Integer.MAX_VALUE; + private int maxOutstandingSplits = 1000; + + public List getColumnsToHide() + { + return columnsToHide; + } + + @Config("hudi.columns-to-hide") + @ConfigDescription("List of column names that will be hidden from the query output. " + + "It can be used to hide Hudi meta fields. 
By default, no fields are hidden.") + public HudiConfig setColumnsToHide(String columnsToHide) + { + this.columnsToHide = COMMA_SPLITTER.splitToStream(nullToEmpty(columnsToHide)) + .map(s -> s.toLowerCase(ENGLISH)) + .collect(toImmutableList()); + return this; + } + + @Config("hudi.metadata-enabled") + @ConfigDescription("Fetch the list of file names and sizes from metadata rather than storage.") + public HudiConfig setMetadataEnabled(boolean metadataEnabled) + { + this.metadataEnabled = metadataEnabled; + return this; + } + + public boolean isMetadataEnabled() + { + return this.metadataEnabled; + } + + @Config("hudi.parquet.use-column-names") + @ConfigDescription("Access Parquet columns using names from the file. If disabled, then columns are accessed using index." + + "Only applicable to Parquet file format.") + public HudiConfig setUseParquetColumnNames(boolean shouldUseParquetColumnNames) + { + this.shouldUseParquetColumnNames = shouldUseParquetColumnNames; + return this; + } + + public boolean getUseParquetColumnNames() + { + return this.shouldUseParquetColumnNames; + } + + @Config("hudi.min-partition-batch-size") + @ConfigDescription("Minimum number of partitions returned in a single batch.") + public HudiConfig setMinPartitionBatchSize(int minPartitionBatchSize) + { + this.minPartitionBatchSize = minPartitionBatchSize; + return this; + } + + @Min(1) + @Max(100) + public int getMinPartitionBatchSize() + { + return minPartitionBatchSize; + } + + @Config("hudi.max-partition-batch-size") + @ConfigDescription("Maximum number of partitions returned in a single batch.") + public HudiConfig setMaxPartitionBatchSize(int maxPartitionBatchSize) + { + this.maxPartitionBatchSize = maxPartitionBatchSize; + return this; + } + + @Min(1) + @Max(1000) + public int getMaxPartitionBatchSize() + { + return maxPartitionBatchSize; + } + + @Config("hudi.size-based-split-weights-enabled") + @ConfigDescription("Unlike uniform splitting, size-based splitting ensures that each batch of 
splits has enough data to process. " + + "By default, it is enabled to improve performance.") + public HudiConfig setSizeBasedSplitWeightsEnabled(boolean sizeBasedSplitWeightsEnabled) + { + this.sizeBasedSplitWeightsEnabled = sizeBasedSplitWeightsEnabled; + return this; + } + + public boolean isSizeBasedSplitWeightsEnabled() + { + return sizeBasedSplitWeightsEnabled; + } + + @Config("hudi.standard-split-weight-size") + @ConfigDescription("The split size corresponding to the standard weight (1.0) " + + "when size based split weights are enabled.") + public HudiConfig setStandardSplitWeightSize(DataSize standardSplitWeightSize) + { + this.standardSplitWeightSize = standardSplitWeightSize; + return this; + } + + @NotNull + public DataSize getStandardSplitWeightSize() + { + return standardSplitWeightSize; + } + + @Config("hudi.minimum-assigned-split-weight") + @ConfigDescription("Minimum weight that a split can be assigned when size based split weights are enabled.") + public HudiConfig setMinimumAssignedSplitWeight(double minimumAssignedSplitWeight) + { + this.minimumAssignedSplitWeight = minimumAssignedSplitWeight; + return this; + } + + @DecimalMax("1") + @DecimalMin(value = "0", inclusive = false) + public double getMinimumAssignedSplitWeight() + { + return minimumAssignedSplitWeight; + } + + @Min(1) + public int getMaxSplitsPerSecond() + { + return maxSplitsPerSecond; + } + + @Config("hudi.max-splits-per-second") + @ConfigDescription("Rate at which splits are enqueued for processing. 
The queue will throttle if this rate limit is breached.") + public HudiConfig setMaxSplitsPerSecond(int maxSplitsPerSecond) + { + this.maxSplitsPerSecond = maxSplitsPerSecond; + return this; + } + + @Min(1) + public int getMaxOutstandingSplits() + { + return maxOutstandingSplits; + } + + @Config("hudi.max-outstanding-splits") + @ConfigDescription("Maximum outstanding splits in a batch enqueued for processing.") + public HudiConfig setMaxOutstandingSplits(int maxOutstandingSplits) + { + this.maxOutstandingSplits = maxOutstandingSplits; + return this; + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiConnector.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiConnector.java new file mode 100644 index 000000000000..a4da3a1eae4f --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiConnector.java @@ -0,0 +1,147 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import io.airlift.bootstrap.LifeCycleManager; +import io.trino.plugin.base.classloader.ClassLoaderSafeConnectorMetadata; +import io.trino.plugin.base.session.SessionPropertiesProvider; +import io.trino.plugin.hive.HiveTransactionHandle; +import io.trino.spi.classloader.ThreadContextClassLoader; +import io.trino.spi.connector.Connector; +import io.trino.spi.connector.ConnectorMetadata; +import io.trino.spi.connector.ConnectorNodePartitioningProvider; +import io.trino.spi.connector.ConnectorPageSourceProvider; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.ConnectorSplitManager; +import io.trino.spi.connector.ConnectorTransactionHandle; +import io.trino.spi.connector.SystemTable; +import io.trino.spi.session.PropertyMetadata; +import io.trino.spi.transaction.IsolationLevel; + +import java.util.List; +import java.util.Set; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.trino.spi.transaction.IsolationLevel.SERIALIZABLE; +import static io.trino.spi.transaction.IsolationLevel.checkConnectorSupports; +import static java.util.Objects.requireNonNull; + +public class HudiConnector + implements Connector +{ + private final LifeCycleManager lifeCycleManager; + private final HudiTransactionManager transactionManager; + private final ConnectorSplitManager splitManager; + private final ConnectorPageSourceProvider pageSourceProvider; + private final ConnectorNodePartitioningProvider nodePartitioningProvider; + private final Set systemTables; + private final List> sessionProperties; + private final List> tableProperties; + + public HudiConnector( + LifeCycleManager lifeCycleManager, + HudiTransactionManager transactionManager, + ConnectorSplitManager splitManager, + ConnectorPageSourceProvider pageSourceProvider, + ConnectorNodePartitioningProvider 
nodePartitioningProvider, + Set systemTables, + Set sessionPropertiesProviders, + List> tableProperties) + { + this.lifeCycleManager = requireNonNull(lifeCycleManager, "lifeCycleManager is null"); + this.transactionManager = requireNonNull(transactionManager, "transactionManager is null"); + this.splitManager = requireNonNull(splitManager, "splitManager is null"); + this.pageSourceProvider = requireNonNull(pageSourceProvider, "pageSourceProvider is null"); + this.nodePartitioningProvider = requireNonNull(nodePartitioningProvider, "nodePartitioningProvider is null"); + this.systemTables = ImmutableSet.copyOf(requireNonNull(systemTables, "systemTables is null")); + this.sessionProperties = requireNonNull(sessionPropertiesProviders, "sessionPropertiesProviders is null").stream() + .flatMap(sessionPropertiesProvider -> sessionPropertiesProvider.getSessionProperties().stream()) + .collect(toImmutableList()); + this.tableProperties = ImmutableList.copyOf(requireNonNull(tableProperties, "tableProperties is null")); + } + + @Override + public ConnectorMetadata getMetadata(ConnectorSession session, ConnectorTransactionHandle transactionHandle) + { + ConnectorMetadata metadata = transactionManager.get(transactionHandle, session.getIdentity()); + return new ClassLoaderSafeConnectorMetadata(metadata, getClass().getClassLoader()); + } + + @Override + public ConnectorSplitManager getSplitManager() + { + return splitManager; + } + + @Override + public ConnectorPageSourceProvider getPageSourceProvider() + { + return pageSourceProvider; + } + + @Override + public ConnectorNodePartitioningProvider getNodePartitioningProvider() + { + return nodePartitioningProvider; + } + + @Override + public Set getSystemTables() + { + return systemTables; + } + + @Override + public List> getSessionProperties() + { + return sessionProperties; + } + + @Override + public List> getTableProperties() + { + return tableProperties; + } + + @Override + public ConnectorTransactionHandle 
beginTransaction(IsolationLevel isolationLevel, boolean readOnly, boolean autoCommit) + { + checkConnectorSupports(SERIALIZABLE, isolationLevel); + ConnectorTransactionHandle transaction = new HiveTransactionHandle(true); + try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(getClass().getClassLoader())) { + transactionManager.put(transaction); + } + return transaction; + } + + @Override + public void commit(ConnectorTransactionHandle transaction) + { + transactionManager.commit(transaction); + } + + @Override + public void rollback(ConnectorTransactionHandle transaction) + { + transactionManager.rollback(transaction); + } + + @Override + public final void shutdown() + { + lifeCycleManager.stop(); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiConnectorFactory.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiConnectorFactory.java new file mode 100644 index 000000000000..71133de3f492 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiConnectorFactory.java @@ -0,0 +1,59 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi; + +import io.trino.spi.connector.Connector; +import io.trino.spi.connector.ConnectorContext; +import io.trino.spi.connector.ConnectorFactory; + +import java.lang.reflect.InvocationTargetException; +import java.util.Map; +import java.util.Optional; + +import static com.google.common.base.Throwables.throwIfUnchecked; +import static io.trino.plugin.base.Versions.checkSpiVersion; + +public class HudiConnectorFactory + implements ConnectorFactory +{ + public HudiConnectorFactory() + {} + + @Override + public String getName() + { + return "hudi"; + } + + @Override + public Connector create(String catalogName, Map config, ConnectorContext context) + { + checkSpiVersion(context, this); + + ClassLoader classLoader = context.duplicatePluginClassLoader(); + try { + return (Connector) classLoader.loadClass(InternalHudiConnectorFactory.class.getName()) + .getMethod("createConnector", String.class, Map.class, ConnectorContext.class, Optional.class) + .invoke(null, catalogName, config, context, Optional.empty()); + } + catch (InvocationTargetException e) { + Throwable targetException = e.getTargetException(); + throwIfUnchecked(targetException); + throw new RuntimeException(targetException); + } + catch (ReflectiveOperationException e) { + throw new RuntimeException(e); + } + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiErrorCode.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiErrorCode.java new file mode 100644 index 000000000000..4d5686823665 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiErrorCode.java @@ -0,0 +1,45 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import io.trino.spi.ErrorCode; +import io.trino.spi.ErrorCodeSupplier; +import io.trino.spi.ErrorType; + +import static io.trino.spi.ErrorType.EXTERNAL; + +public enum HudiErrorCode + implements ErrorCodeSupplier +{ + HUDI_UNKNOWN_TABLE_TYPE(0, EXTERNAL), + HUDI_INVALID_PARTITION_VALUE(1, EXTERNAL), + HUDI_BAD_DATA(2, EXTERNAL), + HUDI_MISSING_DATA(3, EXTERNAL), + HUDI_CANNOT_OPEN_SPLIT(4, EXTERNAL), + HUDI_UNSUPPORTED_FILE_FORMAT(5, EXTERNAL), + HUDI_CURSOR_ERROR(6, EXTERNAL); + + private final ErrorCode errorCode; + + HudiErrorCode(int code, ErrorType type) + { + errorCode = new ErrorCode(code + 0x0507_0000, name(), type); + } + + @Override + public ErrorCode toErrorCode() + { + return errorCode; + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiInputInfo.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiInputInfo.java new file mode 100644 index 000000000000..9af9d7a92906 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiInputInfo.java @@ -0,0 +1,39 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +import static java.util.Objects.requireNonNull; + +public class HudiInputInfo +{ + private final List partitionIds; + + @JsonCreator + public HudiInputInfo(@JsonProperty("partitionIds") List partitionIds) + { + this.partitionIds = ImmutableList.copyOf(requireNonNull(partitionIds, "partitionIds is null")); + } + + @JsonProperty + public List getPartitionIds() + { + return partitionIds; + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiMetadata.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiMetadata.java new file mode 100644 index 000000000000..7e191e7e0624 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiMetadata.java @@ -0,0 +1,256 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.airlift.log.Logger; +import io.trino.hdfs.HdfsContext; +import io.trino.hdfs.HdfsEnvironment; +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.metastore.Column; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hive.metastore.Table; +import io.trino.spi.TrinoException; +import io.trino.spi.connector.ColumnHandle; +import io.trino.spi.connector.ColumnMetadata; +import io.trino.spi.connector.ConnectorMetadata; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.ConnectorTableHandle; +import io.trino.spi.connector.ConnectorTableMetadata; +import io.trino.spi.connector.Constraint; +import io.trino.spi.connector.ConstraintApplicationResult; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.connector.SchemaTablePrefix; +import io.trino.spi.connector.TableColumnsMetadata; +import io.trino.spi.connector.TableNotFoundException; +import io.trino.spi.predicate.TupleDomain; +import io.trino.spi.type.TypeManager; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.model.HoodieTableType; + +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.function.Function; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static io.trino.plugin.hive.HiveMetadata.TABLE_COMMENT; +import static io.trino.plugin.hive.HiveTimestampPrecision.NANOSECONDS; +import static io.trino.plugin.hive.util.HiveUtil.columnMetadataGetter; +import static io.trino.plugin.hive.util.HiveUtil.hiveColumnHandles; +import static io.trino.plugin.hive.util.HiveUtil.isHiveSystemSchema; +import 
static io.trino.plugin.hudi.HudiErrorCode.HUDI_UNKNOWN_TABLE_TYPE; +import static io.trino.plugin.hudi.HudiSessionProperties.getColumnsToHide; +import static io.trino.plugin.hudi.HudiTableProperties.LOCATION_PROPERTY; +import static io.trino.plugin.hudi.HudiTableProperties.PARTITIONED_BY_PROPERTY; +import static io.trino.spi.connector.SchemaTableName.schemaTableName; +import static java.lang.String.format; +import static java.util.Collections.singletonList; +import static java.util.Objects.requireNonNull; +import static java.util.function.Function.identity; +import static org.apache.hudi.common.fs.FSUtils.getFs; +import static org.apache.hudi.common.table.HoodieTableMetaClient.METAFOLDER_NAME; +import static org.apache.hudi.common.util.StringUtils.isNullOrEmpty; +import static org.apache.hudi.exception.TableNotFoundException.checkTableValidity; + +public class HudiMetadata + implements ConnectorMetadata +{ + public static final Logger log = Logger.get(HudiMetadata.class); + + private final HiveMetastore metastore; + private final HdfsEnvironment hdfsEnvironment; + private final TypeManager typeManager; + + public HudiMetadata(HiveMetastore metastore, HdfsEnvironment hdfsEnvironment, TypeManager typeManager) + { + this.metastore = requireNonNull(metastore, "metastore is null"); + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + } + + @Override + public List listSchemaNames(ConnectorSession session) + { + return metastore.getAllDatabases(); + } + + @Override + public HudiTableHandle getTableHandle(ConnectorSession session, SchemaTableName tableName) + { + if (isHiveSystemSchema(tableName.getSchemaName())) { + return null; + } + Optional table = metastore.getTable(tableName.getSchemaName(), tableName.getTableName()); + if (table.isEmpty()) { + return null; + } + if (!isHudiTable(session, table.get())) { + throw new TrinoException(HUDI_UNKNOWN_TABLE_TYPE, 
format("Not a Hudi table: %s", tableName)); + } + return new HudiTableHandle( + tableName.getSchemaName(), + tableName.getTableName(), + table.get().getStorage().getLocation(), + HoodieTableType.COPY_ON_WRITE, + TupleDomain.all(), + TupleDomain.all()); + } + + @Override + public ConnectorTableMetadata getTableMetadata(ConnectorSession session, ConnectorTableHandle table) + { + HudiTableHandle hudiTableHandle = (HudiTableHandle) table; + return getTableMetadata(hudiTableHandle.getSchemaTableName(), getColumnsToHide(session)); + } + + @Override + public Optional> applyFilter(ConnectorSession session, ConnectorTableHandle tableHandle, Constraint constraint) + { + HudiTableHandle handle = (HudiTableHandle) tableHandle; + HudiPredicates predicates = HudiPredicates.from(constraint.getSummary()); + HudiTableHandle newHudiTableHandle = handle.applyPredicates( + predicates.getPartitionColumnPredicates(), + predicates.getRegularColumnPredicates()); + + if (handle.getPartitionPredicates().equals(newHudiTableHandle.getPartitionPredicates()) + && handle.getRegularPredicates().equals(newHudiTableHandle.getRegularPredicates())) { + return Optional.empty(); + } + + return Optional.of(new ConstraintApplicationResult<>( + newHudiTableHandle, + newHudiTableHandle.getRegularPredicates().transformKeys(ColumnHandle.class::cast), + false)); + } + + @Override + public Map getColumnHandles(ConnectorSession session, ConnectorTableHandle tableHandle) + { + HudiTableHandle hudiTableHandle = (HudiTableHandle) tableHandle; + Table table = metastore.getTable(hudiTableHandle.getSchemaName(), hudiTableHandle.getTableName()) + .orElseThrow(() -> new TableNotFoundException(schemaTableName(hudiTableHandle.getSchemaName(), hudiTableHandle.getTableName()))); + return hiveColumnHandles(table, typeManager, NANOSECONDS).stream() + .collect(toImmutableMap(HiveColumnHandle::getName, identity())); + } + + @Override + public ColumnMetadata getColumnMetadata(ConnectorSession session, ConnectorTableHandle 
tableHandle, ColumnHandle columnHandle) + { + return ((HiveColumnHandle) columnHandle).getColumnMetadata(); + } + + @Override + public Optional getInfo(ConnectorTableHandle table) + { + return Optional.of(HudiTableInfo.from((HudiTableHandle) table)); + } + + @Override + public List listTables(ConnectorSession session, Optional optionalSchemaName) + { + ImmutableList.Builder tableNames = ImmutableList.builder(); + for (String schemaName : listSchemas(session, optionalSchemaName)) { + for (String tableName : metastore.getAllTables(schemaName)) { + tableNames.add(new SchemaTableName(schemaName, tableName)); + } + } + return tableNames.build(); + } + + @Override + public Iterator streamTableColumns(ConnectorSession session, SchemaTablePrefix prefix) + { + List tables = prefix.getTable() + .map(ignored -> singletonList(prefix.toSchemaTableName())) + .orElseGet(() -> listTables(session, prefix.getSchema())); + return tables.stream() + .map(table -> getTableColumnMetadata(session, table)) + .flatMap(Optional::stream) + .iterator(); + } + + HiveMetastore getMetastore() + { + return metastore; + } + + private boolean isHudiTable(ConnectorSession session, Table table) + { + String basePath = table.getStorage().getLocation(); + Configuration configuration = hdfsEnvironment.getConfiguration(new HdfsContext(session), new Path(basePath)); + try { + checkTableValidity(getFs(basePath, configuration), new Path(basePath), new Path(basePath, METAFOLDER_NAME)); + } + catch (org.apache.hudi.exception.TableNotFoundException e) { + log.warn("Could not find Hudi table at path '%s'", basePath); + return false; + } + return true; + } + + private Optional getTableColumnMetadata(ConnectorSession session, SchemaTableName table) + { + try { + List columns = getTableMetadata(table, getColumnsToHide(session)).getColumns(); + return Optional.of(TableColumnsMetadata.forTable(table, columns)); + } + catch (TableNotFoundException ignored) { + return Optional.empty(); + } + } + + private 
ConnectorTableMetadata getTableMetadata(SchemaTableName tableName, Collection columnsToHide) + { + Table table = metastore.getTable(tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new TableNotFoundException(tableName)); + Function metadataGetter = columnMetadataGetter(table); + List columns = hiveColumnHandles(table, typeManager, NANOSECONDS).stream() + .filter(column -> !columnsToHide.contains(column.getName())) + .map(metadataGetter) + .collect(toImmutableList()); + + ImmutableMap.Builder properties = ImmutableMap.builder(); + // Location property + String location = table.getStorage().getLocation(); + if (!isNullOrEmpty(location)) { + properties.put(LOCATION_PROPERTY, location); + } + + // Partitioning property + List partitionedBy = table.getPartitionColumns().stream() + .map(Column::getName) + .collect(toImmutableList()); + if (!partitionedBy.isEmpty()) { + properties.put(PARTITIONED_BY_PROPERTY, partitionedBy); + } + + Optional comment = Optional.ofNullable(table.getParameters().get(TABLE_COMMENT)); + return new ConnectorTableMetadata(tableName, columns, properties.buildOrThrow(), comment); + } + + private List listSchemas(ConnectorSession session, Optional schemaName) + { + return schemaName + .filter(name -> !isHiveSystemSchema(name)) + .map(Collections::singletonList) + .orElseGet(() -> listSchemaNames(session)); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiMetadataFactory.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiMetadataFactory.java new file mode 100644 index 000000000000..bb1a4832bc9e --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiMetadataFactory.java @@ -0,0 +1,47 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import io.trino.hdfs.HdfsEnvironment; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hive.metastore.HiveMetastoreFactory; +import io.trino.spi.security.ConnectorIdentity; +import io.trino.spi.type.TypeManager; + +import javax.inject.Inject; + +import java.util.Optional; + +import static java.util.Objects.requireNonNull; + +public class HudiMetadataFactory +{ + private final HiveMetastoreFactory metastoreFactory; + private final HdfsEnvironment hdfsEnvironment; + private final TypeManager typeManager; + + @Inject + public HudiMetadataFactory(HiveMetastoreFactory metastoreFactory, HdfsEnvironment hdfsEnvironment, TypeManager typeManager) + { + this.metastoreFactory = requireNonNull(metastoreFactory, "metastore is null"); + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + } + + public HudiMetadata create(ConnectorIdentity identity) + { + HiveMetastore metastore = metastoreFactory.createMetastore(Optional.of(identity)); + return new HudiMetadata(metastore, hdfsEnvironment, typeManager); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiModule.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiModule.java new file mode 100644 index 000000000000..52bd1f7028c2 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiModule.java @@ -0,0 +1,89 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * 
/**
 * Guice module wiring the Hudi connector's services: transaction management,
 * configuration, session/table properties, split and page-source providers,
 * and JMX-exported file-format statistics.
 */
public class HudiModule
        implements Module
{
    @Override
    public void configure(Binder binder)
    {
        binder.bind(HudiTransactionManager.class).in(Scopes.SINGLETON);

        configBinder(binder).bindConfig(HudiConfig.class);
        configBinder(binder).bindConfig(HiveMetastoreConfig.class);
        // The Hudi connector never translates Hive views
        binder.bind(Key.get(boolean.class, TranslateHiveViews.class)).toInstance(false);

        newSetBinder(binder, SessionPropertiesProvider.class).addBinding().to(HudiSessionProperties.class).in(Scopes.SINGLETON);
        binder.bind(HudiTableProperties.class).in(Scopes.SINGLETON);

        binder.bind(ConnectorSplitManager.class).to(HudiSplitManager.class).in(Scopes.SINGLETON);
        binder.bind(ConnectorPageSourceProvider.class).to(HudiPageSourceProvider.class).in(Scopes.SINGLETON);
        // Node partitioning is reused from the Hive connector
        binder.bind(ConnectorNodePartitioningProvider.class).to(HiveNodePartitioningProvider.class).in(Scopes.SINGLETON);

        configBinder(binder).bindConfig(ParquetReaderConfig.class);
        configBinder(binder).bindConfig(ParquetWriterConfig.class);

        binder.bind(HudiMetadataFactory.class).in(Scopes.SINGLETON);

        binder.bind(FileFormatDataSourceStats.class).in(Scopes.SINGLETON);
        newExporter(binder).export(FileFormatDataSourceStats.class).withGeneratedName();
    }

    // Unbounded pool for split discovery; threads are daemon so they never
    // block JVM shutdown.
    @ForHudiSplitManager
    @Singleton
    @Provides
    public ExecutorService createExecutorService()
    {
        return newCachedThreadPool(daemonThreadsNamed("hudi-split-manager-%d"));
    }

    // Resolves the per-transaction metastore from the transaction manager.
    // NOTE(review): the BiFunction type parameters appear stripped in this view;
    // presumably BiFunction<ConnectorIdentity, HudiTransactionHandle, HiveMetastore> — confirm.
    @Singleton
    @Provides
    public BiFunction createHiveMetastoreGetter(HudiTransactionManager transactionManager)
    {
        return (identity, transactionHandle) ->
                transactionManager.get(transactionHandle, identity).getMetastore();
    }
}
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.spi.Page; +import io.trino.spi.block.Block; +import io.trino.spi.block.RunLengthEncodedBlock; +import io.trino.spi.connector.ConnectorPageSource; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import static io.trino.plugin.base.util.Closables.closeAllSuppress; +import static java.util.Objects.requireNonNull; + +public class HudiPageSource + implements ConnectorPageSource +{ + private final List columnHandles; + private final ConnectorPageSource dataPageSource; + private final Map partitionBlocks; + + public HudiPageSource( + List columnHandles, + Map partitionBlocks, + ConnectorPageSource dataPageSource) + { + this.columnHandles = ImmutableList.copyOf(requireNonNull(columnHandles, "columnHandles is null")); + this.partitionBlocks = ImmutableMap.copyOf(requireNonNull(partitionBlocks, "partitionBlocks is null")); + this.dataPageSource = requireNonNull(dataPageSource, "dataPageSource is null"); + } + + @Override + public long getCompletedBytes() + { + return dataPageSource.getCompletedBytes(); + } + + @Override + public long getReadTimeNanos() + { + return dataPageSource.getReadTimeNanos(); + } + + @Override + public boolean isFinished() + { + return dataPageSource.isFinished(); + } + + @Override + public Page getNextPage() + { + try { + Page page = dataPageSource.getNextPage(); + if 
(page == null) { + return null; + } + int positionCount = page.getPositionCount(); + + int dataColumnIndex = 0; + int columnIndex = 0; + Block[] blocksWithPartitionColumns = new Block[columnHandles.size()]; + for (HiveColumnHandle columnHandle : columnHandles) { + if (columnHandle.isPartitionKey()) { + Block partitionValue = partitionBlocks.get(columnHandle.getName()); + blocksWithPartitionColumns[columnIndex] = RunLengthEncodedBlock.create(partitionValue, positionCount); + } + else { + blocksWithPartitionColumns[columnIndex] = page.getBlock(dataColumnIndex); + dataColumnIndex++; + } + + columnIndex++; + } + return new Page(positionCount, blocksWithPartitionColumns); + } + catch (RuntimeException e) { + closeAllSuppress(e, this); + throw e; + } + } + + @Override + public long getMemoryUsage() + { + return dataPageSource.getMemoryUsage(); + } + + @Override + public void close() + throws IOException + { + dataPageSource.close(); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPageSourceProvider.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPageSourceProvider.java new file mode 100644 index 000000000000..3ba24507e56b --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPageSourceProvider.java @@ -0,0 +1,347 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableList; +import io.trino.filesystem.TrinoFileSystem; +import io.trino.filesystem.TrinoFileSystemFactory; +import io.trino.filesystem.TrinoInputFile; +import io.trino.parquet.ParquetCorruptionException; +import io.trino.parquet.ParquetDataSource; +import io.trino.parquet.ParquetDataSourceId; +import io.trino.parquet.ParquetReaderOptions; +import io.trino.parquet.predicate.Predicate; +import io.trino.parquet.reader.MetadataReader; +import io.trino.parquet.reader.ParquetReader; +import io.trino.parquet.reader.ParquetReaderColumn; +import io.trino.plugin.hive.FileFormatDataSourceStats; +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.HivePartitionKey; +import io.trino.plugin.hive.ReaderColumns; +import io.trino.plugin.hive.parquet.ParquetPageSource; +import io.trino.plugin.hive.parquet.ParquetReaderConfig; +import io.trino.plugin.hive.parquet.TrinoParquetDataSource; +import io.trino.spi.TrinoException; +import io.trino.spi.block.Block; +import io.trino.spi.connector.ColumnHandle; +import io.trino.spi.connector.ConnectorPageSource; +import io.trino.spi.connector.ConnectorPageSourceProvider; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.ConnectorSplit; +import io.trino.spi.connector.ConnectorTableHandle; +import io.trino.spi.connector.ConnectorTransactionHandle; +import io.trino.spi.connector.DynamicFilter; +import io.trino.spi.predicate.TupleDomain; +import io.trino.spi.type.Decimals; +import io.trino.spi.type.TypeSignature; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.BlockMissingException; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import 
org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.schema.MessageType; +import org.joda.time.DateTimeZone; + +import javax.inject.Inject; + +import java.io.IOException; +import java.sql.Timestamp; +import java.time.LocalDate; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeParseException; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.TimeZone; +import java.util.stream.Collectors; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.airlift.slice.Slices.utf8Slice; +import static io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext; +import static io.trino.parquet.ParquetTypeUtils.getColumnIO; +import static io.trino.parquet.ParquetTypeUtils.getDescriptors; +import static io.trino.parquet.predicate.PredicateUtils.buildPredicate; +import static io.trino.parquet.predicate.PredicateUtils.predicateMatches; +import static io.trino.parquet.reader.ParquetReaderColumn.getParquetReaderFields; +import static io.trino.plugin.hive.HivePageSourceProvider.projectBaseColumns; +import static io.trino.plugin.hive.parquet.ParquetPageSourceFactory.createParquetReaderColumns; +import static io.trino.plugin.hive.parquet.ParquetPageSourceFactory.getColumnIndexStore; +import static io.trino.plugin.hive.parquet.ParquetPageSourceFactory.getParquetMessageType; +import static io.trino.plugin.hive.parquet.ParquetPageSourceFactory.getParquetTupleDomain; +import static io.trino.plugin.hudi.HudiErrorCode.HUDI_BAD_DATA; +import static io.trino.plugin.hudi.HudiErrorCode.HUDI_CANNOT_OPEN_SPLIT; +import static io.trino.plugin.hudi.HudiErrorCode.HUDI_CURSOR_ERROR; +import static io.trino.plugin.hudi.HudiErrorCode.HUDI_INVALID_PARTITION_VALUE; +import static io.trino.plugin.hudi.HudiErrorCode.HUDI_MISSING_DATA; +import static 
/**
 * Creates page sources for Hudi splits.
 * <p>
 * Only Parquet base files are supported. Regular columns are read through a
 * Parquet reader with predicate-based row-group pruning; partition columns are
 * materialized as constant blocks from the split's partition keys.
 * NOTE(review): generic type parameters below were stripped in transit and
 * have been restored from the Trino/Parquet API signatures — confirm.
 */
public class HudiPageSourceProvider
        implements ConnectorPageSourceProvider
{
    private final TrinoFileSystemFactory fileSystemFactory;
    private final FileFormatDataSourceStats dataSourceStats;
    private final ParquetReaderOptions options;
    private final DateTimeZone timeZone;

    @Inject
    public HudiPageSourceProvider(
            TrinoFileSystemFactory fileSystemFactory,
            FileFormatDataSourceStats dataSourceStats,
            ParquetReaderConfig parquetReaderConfig)
    {
        this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null");
        this.dataSourceStats = requireNonNull(dataSourceStats, "dataSourceStats is null");
        this.options = requireNonNull(parquetReaderConfig, "parquetReaderConfig is null").toParquetReaderOptions();
        // Uses the JVM default zone for timestamp decoding — TODO confirm intended
        this.timeZone = DateTimeZone.forID(TimeZone.getDefault().getID());
    }

    @Override
    public ConnectorPageSource createPageSource(
            ConnectorTransactionHandle transaction,
            ConnectorSession session,
            ConnectorSplit connectorSplit,
            ConnectorTableHandle connectorTable,
            List<ColumnHandle> columns,
            DynamicFilter dynamicFilter)
    {
        HudiSplit split = (HudiSplit) connectorSplit;
        Path path = new Path(split.getPath());
        HoodieFileFormat hudiFileFormat = getHudiFileFormat(path.toString());
        if (!HoodieFileFormat.PARQUET.equals(hudiFileFormat)) {
            throw new TrinoException(HUDI_UNSUPPORTED_FILE_FORMAT, format("File format %s not supported", hudiFileFormat));
        }

        List<HiveColumnHandle> hiveColumns = columns.stream()
                .map(HiveColumnHandle.class::cast)
                .collect(toList());
        // just send regular columns to create parquet page source
        // for partition columns, separate blocks will be created
        List<HiveColumnHandle> regularColumns = hiveColumns.stream()
                .filter(columnHandle -> !columnHandle.isPartitionKey())
                .collect(Collectors.toList());
        TrinoFileSystem fileSystem = fileSystemFactory.create(session);
        TrinoInputFile inputFile = fileSystem.newInputFile(path.toString(), split.getFileSize());
        ConnectorPageSource dataPageSource = createPageSource(session, regularColumns, split, inputFile, dataSourceStats, options, timeZone);

        return new HudiPageSource(
                hiveColumns,
                convertPartitionValues(hiveColumns, split.getPartitionKeys()), // create blocks for partition values
                dataPageSource);
    }

    /**
     * Opens a Parquet page source over the split's byte range, pruning row
     * groups by start/length and by the split's predicate (with column-index
     * support when statistics are enabled).
     */
    private static ConnectorPageSource createPageSource(
            ConnectorSession session,
            List<HiveColumnHandle> columns,
            HudiSplit hudiSplit,
            TrinoInputFile inputFile,
            FileFormatDataSourceStats dataSourceStats,
            ParquetReaderOptions options,
            DateTimeZone timeZone)
    {
        ParquetDataSource dataSource = null;
        boolean useColumnNames = shouldUseParquetColumnNames(session);
        Path path = new Path(hudiSplit.getPath());
        long start = hudiSplit.getStart();
        long length = hudiSplit.getLength();
        try {
            dataSource = new TrinoParquetDataSource(inputFile, options, dataSourceStats);
            ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource, Optional.empty());
            FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
            MessageType fileSchema = fileMetaData.getSchema();

            // Project the file schema down to the requested columns
            Optional<MessageType> message = getParquetMessageType(columns, useColumnNames, fileSchema);

            MessageType requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));
            MessageColumnIO messageColumn = getColumnIO(fileSchema, requestedSchema);

            Map<List<String>, ColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
            TupleDomain<ColumnDescriptor> parquetTupleDomain = options.isIgnoreStatistics()
                    ? TupleDomain.all()
                    : getParquetTupleDomain(descriptorsByPath, hudiSplit.getPredicate(), fileSchema, useColumnNames);

            Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath, timeZone);

            // Select row groups whose first data page lies inside [start, start+length)
            // and that can match the predicate; track each group's starting row.
            long nextStart = 0;
            ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
            ImmutableList.Builder<Long> blockStarts = ImmutableList.builder();
            ImmutableList.Builder<Optional<ColumnIndexStore>> columnIndexes = ImmutableList.builder();
            for (BlockMetaData block : parquetMetadata.getBlocks()) {
                long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
                Optional<ColumnIndexStore> columnIndex = getColumnIndexStore(dataSource, block, descriptorsByPath, parquetTupleDomain, options);
                if (start <= firstDataPage && firstDataPage < start + length
                        && predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain, columnIndex, timeZone)) {
                    blocks.add(block);
                    blockStarts.add(nextStart);
                    columnIndexes.add(columnIndex);
                }
                nextStart += block.getRowCount();
            }

            // Map projected (dereferenced) columns back onto their base columns
            Optional<ReaderColumns> readerProjections = projectBaseColumns(columns);
            List<HiveColumnHandle> baseColumns = readerProjections.map(projection ->
                    projection.get().stream()
                            .map(HiveColumnHandle.class::cast)
                            .collect(toUnmodifiableList()))
                    .orElse(columns);
            List<ParquetReaderColumn> parquetReaderColumns = createParquetReaderColumns(baseColumns, fileSchema, messageColumn, useColumnNames);
            ParquetDataSourceId dataSourceId = dataSource.getId();
            ParquetReader parquetReader = new ParquetReader(
                    Optional.ofNullable(fileMetaData.getCreatedBy()),
                    getParquetReaderFields(parquetReaderColumns),
                    blocks.build(),
                    blockStarts.build(),
                    dataSource,
                    timeZone,
                    newSimpleAggregatedMemoryContext(),
                    options,
                    exception -> handleException(dataSourceId, exception),
                    Optional.of(parquetPredicate),
                    columnIndexes.build(),
                    Optional.empty());

            return new ParquetPageSource(
                    parquetReader,
                    parquetReaderColumns);
        }
        catch (IOException | RuntimeException e) {
            try {
                if (dataSource != null) {
                    dataSource.close();
                }
            }
            catch (IOException ignored) {
                // best-effort cleanup; the original exception is rethrown below
            }
            if (e instanceof TrinoException) {
                throw (TrinoException) e;
            }
            String message = format("Error opening Hudi split %s (offset=%s, length=%s): %s",
                    path, start, length, e.getMessage());

            if (e instanceof ParquetCorruptionException) {
                throw new TrinoException(HUDI_BAD_DATA, message, e);
            }

            if (e instanceof BlockMissingException) {
                throw new TrinoException(HUDI_MISSING_DATA, message, e);
            }
            throw new TrinoException(HUDI_CANNOT_OPEN_SPLIT, message, e);
        }
    }

    // Maps reader-level exceptions onto connector error codes
    private static TrinoException handleException(ParquetDataSourceId dataSourceId, Exception exception)
    {
        if (exception instanceof TrinoException) {
            return (TrinoException) exception;
        }
        if (exception instanceof ParquetCorruptionException) {
            return new TrinoException(HUDI_BAD_DATA, exception);
        }
        return new TrinoException(HUDI_CURSOR_ERROR, format("Failed to read Parquet file: %s", dataSourceId), exception);
    }

    /**
     * Builds a single-value block per partition column from the split's
     * partition keys; columns without a matching key get a null block.
     */
    private Map<String, Block> convertPartitionValues(
            List<HiveColumnHandle> allColumns,
            List<HivePartitionKey> partitionKeys)
    {
        return allColumns.stream()
                .filter(HiveColumnHandle::isPartitionKey)
                .collect(toMap(
                        HiveColumnHandle::getName,
                        columnHandle -> nativeValueToBlock(
                                columnHandle.getType(),
                                partitionToNativeValue(
                                        columnHandle.getName(),
                                        partitionKeys,
                                        columnHandle.getType().getTypeSignature()).orElse(null))));
    }

    /**
     * Parses a partition key's string value into the engine's native
     * representation for the column's type. Returns empty when no partition
     * key matches the column name (case-insensitive).
     *
     * @throws TrinoException {@code HUDI_INVALID_PARTITION_VALUE} when the
     *         value cannot be parsed or the type is unsupported
     */
    private static Optional<Object> partitionToNativeValue(
            String partitionColumnName,
            List<HivePartitionKey> partitionKeys,
            TypeSignature partitionDataType)
    {
        HivePartitionKey partitionKey = partitionKeys.stream().filter(key -> key.getName().equalsIgnoreCase(partitionColumnName)).findFirst().orElse(null);
        if (isNull(partitionKey)) {
            return Optional.empty();
        }

        String partitionValue = partitionKey.getValue();
        String baseType = partitionDataType.getBase();
        try {
            switch (baseType) {
                case TINYINT:
                case SMALLINT:
                case INTEGER:
                case BIGINT:
                    return Optional.of(parseLong(partitionValue));
                case REAL:
                    return Optional.of((long) floatToRawIntBits(parseFloat(partitionValue)));
                case DOUBLE:
                    return Optional.of(parseDouble(partitionValue));
                case VARCHAR:
                case VARBINARY:
                    return Optional.of(utf8Slice(partitionValue));
                case DATE:
                    return Optional.of(LocalDate.parse(partitionValue, DateTimeFormatter.ISO_LOCAL_DATE).toEpochDay());
                case TIMESTAMP:
                    // NOTE(review): seconds-precision epoch scaled by 1_000 —
                    // sub-second components are dropped; confirm against the
                    // engine's expected timestamp precision
                    return Optional.of(Timestamp.valueOf(partitionValue).toLocalDateTime().toEpochSecond(ZoneOffset.UTC) * 1_000);
                case BOOLEAN:
                    checkArgument(partitionValue.equalsIgnoreCase("true") || partitionValue.equalsIgnoreCase("false"));
                    return Optional.of(Boolean.valueOf(partitionValue));
                case DECIMAL:
                    return Optional.of(Decimals.parse(partitionValue).getObject());
                default:
                    throw new TrinoException(
                            HUDI_INVALID_PARTITION_VALUE,
                            format("Unsupported data type '%s' for partition column %s", partitionDataType, partitionColumnName));
            }
        }
        catch (IllegalArgumentException | DateTimeParseException e) {
            throw new TrinoException(
                    HUDI_INVALID_PARTITION_VALUE,
                    format("Can not parse partition value '%s' of type '%s' for partition column '%s'", partitionValue, partitionDataType, partitionColumnName),
                    e);
        }
    }
}
partition column '%s'", partitionValue, partitionDataType, partitionColumnName), + e); + } + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPlugin.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPlugin.java new file mode 100644 index 000000000000..1744105faddb --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPlugin.java @@ -0,0 +1,28 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableList; +import io.trino.spi.Plugin; +import io.trino.spi.connector.ConnectorFactory; + +public class HudiPlugin + implements Plugin +{ + @Override + public Iterable getConnectorFactories() + { + return ImmutableList.of(new HudiConnectorFactory()); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPredicates.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPredicates.java new file mode 100644 index 000000000000..16859e8e6dd3 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPredicates.java @@ -0,0 +1,68 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.spi.connector.ColumnHandle; +import io.trino.spi.predicate.Domain; +import io.trino.spi.predicate.TupleDomain; + +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; + +public class HudiPredicates +{ + private final TupleDomain partitionColumnPredicates; + private final TupleDomain regularColumnPredicates; + + public static HudiPredicates from(TupleDomain predicate) + { + Map partitionColumnPredicates = new HashMap<>(); + Map regularColumnPredicates = new HashMap<>(); + + Optional> domains = predicate.getDomains(); + domains.ifPresent(columnHandleDomainMap -> columnHandleDomainMap.forEach((key, value) -> { + HiveColumnHandle columnHandle = (HiveColumnHandle) key; + if (columnHandle.isPartitionKey()) { + partitionColumnPredicates.put(columnHandle, value); + } + else { + regularColumnPredicates.put(columnHandle, value); + } + })); + + return new HudiPredicates( + TupleDomain.withColumnDomains(partitionColumnPredicates), + TupleDomain.withColumnDomains(regularColumnPredicates)); + } + + private HudiPredicates( + TupleDomain partitionColumnPredicates, + TupleDomain regularColumnPredicates) + { + this.partitionColumnPredicates = partitionColumnPredicates; + this.regularColumnPredicates = regularColumnPredicates; + } + + public TupleDomain getPartitionColumnPredicates() + { + return partitionColumnPredicates; + } + + public TupleDomain getRegularColumnPredicates() + { + return regularColumnPredicates; + } +} diff --git 
a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSessionProperties.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSessionProperties.java new file mode 100644 index 000000000000..355a5fd5180d --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSessionProperties.java @@ -0,0 +1,156 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableList; +import io.airlift.units.DataSize; +import io.trino.plugin.base.session.SessionPropertiesProvider; +import io.trino.spi.TrinoException; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.session.PropertyMetadata; +import io.trino.spi.type.ArrayType; + +import javax.inject.Inject; + +import java.util.Collection; +import java.util.List; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.trino.plugin.base.session.PropertyMetadataUtil.dataSizeProperty; +import static io.trino.spi.StandardErrorCode.INVALID_SESSION_PROPERTY; +import static io.trino.spi.session.PropertyMetadata.booleanProperty; +import static io.trino.spi.session.PropertyMetadata.doubleProperty; +import static io.trino.spi.session.PropertyMetadata.integerProperty; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.lang.String.format; +import static java.util.Locale.ENGLISH; + +public class HudiSessionProperties + implements SessionPropertiesProvider +{ + 
private static final String COLUMNS_TO_HIDE = "columns_to_hide"; + private static final String METADATA_ENABLED = "metadata_enabled"; + private static final String USE_PARQUET_COLUMN_NAMES = "use_parquet_column_names"; + private static final String MIN_PARTITION_BATCH_SIZE = "min_partition_batch_size"; + private static final String MAX_PARTITION_BATCH_SIZE = "max_partition_batch_size"; + private static final String SIZE_BASED_SPLIT_WEIGHTS_ENABLED = "size_based_split_weights_enabled"; + private static final String STANDARD_SPLIT_WEIGHT_SIZE = "standard_split_weight_size"; + private static final String MINIMUM_ASSIGNED_SPLIT_WEIGHT = "minimum_assigned_split_weight"; + + private final List> sessionProperties; + + @Inject + public HudiSessionProperties(HudiConfig hudiConfig) + { + sessionProperties = ImmutableList.of( + new PropertyMetadata<>( + COLUMNS_TO_HIDE, + "List of column names that will be hidden", + new ArrayType(VARCHAR), + List.class, + hudiConfig.getColumnsToHide(), + false, + value -> ((Collection) value).stream() + .map(name -> ((String) name).toLowerCase(ENGLISH)) + .collect(toImmutableList()), + value -> value), + booleanProperty( + METADATA_ENABLED, + "For Hudi tables prefer to fetch the list of files from its metadata", + hudiConfig.isMetadataEnabled(), + false), + booleanProperty( + USE_PARQUET_COLUMN_NAMES, + "Access parquet columns using names from the file. 
If disabled, then columns are accessed using index.", + hudiConfig.getUseParquetColumnNames(), + false), + integerProperty( + MIN_PARTITION_BATCH_SIZE, + "Minimum number of partitions returned in a single batch.", + hudiConfig.getMinPartitionBatchSize(), + false), + integerProperty( + MAX_PARTITION_BATCH_SIZE, + "Maximum number of partitions returned in a single batch.", + hudiConfig.getMaxPartitionBatchSize(), + false), + booleanProperty( + SIZE_BASED_SPLIT_WEIGHTS_ENABLED, + format("If enabled, size-based splitting ensures that each batch of splits has enough data to process as defined by %s", STANDARD_SPLIT_WEIGHT_SIZE), + hudiConfig.isSizeBasedSplitWeightsEnabled(), + false), + dataSizeProperty( + STANDARD_SPLIT_WEIGHT_SIZE, + "The split size corresponding to the standard weight (1.0) when size-based split weights are enabled", + hudiConfig.getStandardSplitWeightSize(), + false), + doubleProperty( + MINIMUM_ASSIGNED_SPLIT_WEIGHT, + "Minimum assigned split weight when size-based split weights are enabled", + hudiConfig.getMinimumAssignedSplitWeight(), + value -> { + if (!Double.isFinite(value) || value <= 0 || value > 1) { + throw new TrinoException(INVALID_SESSION_PROPERTY, format("%s must be > 0 and <= 1.0: %s", MINIMUM_ASSIGNED_SPLIT_WEIGHT, value)); + } + }, + false)); + } + + @Override + public List> getSessionProperties() + { + return sessionProperties; + } + + @SuppressWarnings("unchecked") + public static List getColumnsToHide(ConnectorSession session) + { + return (List) session.getProperty(COLUMNS_TO_HIDE, List.class); + } + + public static boolean isHudiMetadataEnabled(ConnectorSession session) + { + return session.getProperty(METADATA_ENABLED, Boolean.class); + } + + public static boolean shouldUseParquetColumnNames(ConnectorSession session) + { + return session.getProperty(USE_PARQUET_COLUMN_NAMES, Boolean.class); + } + + public static int getMinPartitionBatchSize(ConnectorSession session) + { + return session.getProperty(MIN_PARTITION_BATCH_SIZE, 
Integer.class); + } + + public static int getMaxPartitionBatchSize(ConnectorSession session) + { + return session.getProperty(MAX_PARTITION_BATCH_SIZE, Integer.class); + } + + public static boolean isSizeBasedSplitWeightsEnabled(ConnectorSession session) + { + return session.getProperty(SIZE_BASED_SPLIT_WEIGHTS_ENABLED, Boolean.class); + } + + public static DataSize getStandardSplitWeightSize(ConnectorSession session) + { + return session.getProperty(STANDARD_SPLIT_WEIGHT_SIZE, DataSize.class); + } + + public static double getMinimumAssignedSplitWeight(ConnectorSession session) + { + return session.getProperty(MINIMUM_ASSIGNED_SPLIT_WEIGHT, Double.class); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSplit.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSplit.java new file mode 100644 index 000000000000..956a757aa11b --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSplit.java @@ -0,0 +1,147 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.HivePartitionKey; +import io.trino.spi.HostAddress; +import io.trino.spi.SplitWeight; +import io.trino.spi.connector.ConnectorSplit; +import io.trino.spi.predicate.TupleDomain; + +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; + +public class HudiSplit + implements ConnectorSplit +{ + private final String path; + private final long start; + private final long length; + private final long fileSize; + private final List addresses; + private final TupleDomain predicate; + private final List partitionKeys; + private final SplitWeight splitWeight; + + @JsonCreator + public HudiSplit( + @JsonProperty("path") String path, + @JsonProperty("start") long start, + @JsonProperty("length") long length, + @JsonProperty("fileSize") long fileSize, + @JsonProperty("addresses") List addresses, + @JsonProperty("predicate") TupleDomain predicate, + @JsonProperty("partitionKeys") List partitionKeys, + @JsonProperty("splitWeight") SplitWeight splitWeight) + { + checkArgument(start >= 0, "start must be positive"); + checkArgument(length >= 0, "length must be positive"); + checkArgument(start + length <= fileSize, "fileSize must be at least start + length"); + + this.path = requireNonNull(path, "path is null"); + this.start = start; + this.length = length; + this.fileSize = fileSize; + this.addresses = ImmutableList.copyOf(requireNonNull(addresses, "addresses is null")); + this.predicate = requireNonNull(predicate, "predicate is null"); + this.partitionKeys = ImmutableList.copyOf(requireNonNull(partitionKeys, 
"partitionKeys is null")); + this.splitWeight = requireNonNull(splitWeight, "splitWeight is null"); + } + + @Override + public boolean isRemotelyAccessible() + { + return true; + } + + @JsonProperty + @Override + public List getAddresses() + { + return addresses; + } + + @Override + public Object getInfo() + { + return ImmutableMap.builder() + .put("path", path) + .put("start", start) + .put("length", length) + .put("fileSize", fileSize) + .buildOrThrow(); + } + + @JsonProperty + @Override + public SplitWeight getSplitWeight() + { + return splitWeight; + } + + @JsonProperty + public String getPath() + { + return path; + } + + @JsonProperty + public long getStart() + { + return start; + } + + @JsonProperty + public long getLength() + { + return length; + } + + @JsonProperty + public long getFileSize() + { + return fileSize; + } + + @JsonProperty + public TupleDomain getPredicate() + { + return predicate; + } + + @JsonProperty + public List getPartitionKeys() + { + return partitionKeys; + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(path) + .addValue(start) + .addValue(length) + .addValue(fileSize) + .toString(); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSplitManager.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSplitManager.java new file mode 100644 index 000000000000..34afd7e042f3 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSplitManager.java @@ -0,0 +1,107 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import io.trino.hdfs.HdfsContext; +import io.trino.hdfs.HdfsEnvironment; +import io.trino.plugin.base.classloader.ClassLoaderSafeConnectorSplitSource; +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.HiveTransactionHandle; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hive.metastore.Table; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.ConnectorSplitManager; +import io.trino.spi.connector.ConnectorSplitSource; +import io.trino.spi.connector.ConnectorTableHandle; +import io.trino.spi.connector.ConnectorTransactionHandle; +import io.trino.spi.connector.Constraint; +import io.trino.spi.connector.DynamicFilter; +import io.trino.spi.connector.TableNotFoundException; +import io.trino.spi.security.ConnectorIdentity; +import org.apache.hadoop.fs.Path; + +import javax.annotation.PreDestroy; +import javax.inject.Inject; + +import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.function.BiFunction; + +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static io.trino.spi.connector.SchemaTableName.schemaTableName; +import static java.util.Objects.requireNonNull; +import static java.util.function.Function.identity; + +public class HudiSplitManager + implements ConnectorSplitManager +{ + private final HudiTransactionManager transactionManager; + private final BiFunction metastoreProvider; + private final HdfsEnvironment hdfsEnvironment; + private final ExecutorService executor; + private final int maxSplitsPerSecond; + private final int maxOutstandingSplits; + + @Inject + public HudiSplitManager( + HudiTransactionManager transactionManager, + BiFunction metastoreProvider, + HdfsEnvironment hdfsEnvironment, + @ForHudiSplitManager ExecutorService executor, + HudiConfig hudiConfig) + { + 
this.transactionManager = requireNonNull(transactionManager, "transactionManager is null"); + this.metastoreProvider = requireNonNull(metastoreProvider, "metastoreProvider is null"); + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.executor = requireNonNull(executor, "executor is null"); + this.maxSplitsPerSecond = requireNonNull(hudiConfig, "hudiConfig is null").getMaxSplitsPerSecond(); + this.maxOutstandingSplits = hudiConfig.getMaxOutstandingSplits(); + } + + @PreDestroy + public void destroy() + { + this.executor.shutdown(); + } + + @Override + public ConnectorSplitSource getSplits( + ConnectorTransactionHandle transaction, + ConnectorSession session, + ConnectorTableHandle tableHandle, + DynamicFilter dynamicFilter, + Constraint constraint) + { + HudiTableHandle hudiTableHandle = (HudiTableHandle) tableHandle; + HudiMetadata hudiMetadata = transactionManager.get(transaction, session.getIdentity()); + Map partitionColumnHandles = hudiMetadata.getColumnHandles(session, tableHandle) + .values().stream().map(HiveColumnHandle.class::cast) + .filter(HiveColumnHandle::isPartitionKey) + .collect(toImmutableMap(HiveColumnHandle::getName, identity())); + HiveMetastore metastore = metastoreProvider.apply(session.getIdentity(), (HiveTransactionHandle) transaction); + Table table = metastore.getTable(hudiTableHandle.getSchemaName(), hudiTableHandle.getTableName()) + .orElseThrow(() -> new TableNotFoundException(schemaTableName(hudiTableHandle.getSchemaName(), hudiTableHandle.getTableName()))); + HudiSplitSource splitSource = new HudiSplitSource( + session, + metastore, + table, + hudiTableHandle, + hdfsEnvironment.getConfiguration(new HdfsContext(session), new Path(table.getStorage().getLocation())), + partitionColumnHandles, + executor, + maxSplitsPerSecond, + maxOutstandingSplits); + return new ClassLoaderSafeConnectorSplitSource(splitSource, HudiSplitManager.class.getClassLoader()); + } +} diff --git 
a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSplitSource.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSplitSource.java new file mode 100644 index 000000000000..2726bbb741d0 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiSplitSource.java @@ -0,0 +1,147 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import com.google.common.util.concurrent.Futures; +import io.airlift.units.DataSize; +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hive.metastore.Table; +import io.trino.plugin.hive.util.AsyncQueue; +import io.trino.plugin.hive.util.ThrottledAsyncQueue; +import io.trino.plugin.hudi.query.HudiDirectoryLister; +import io.trino.plugin.hudi.query.HudiReadOptimizedDirectoryLister; +import io.trino.plugin.hudi.split.HudiBackgroundSplitLoader; +import io.trino.plugin.hudi.split.HudiSplitWeightProvider; +import io.trino.plugin.hudi.split.SizeBasedSplitWeightProvider; +import io.trino.spi.TrinoException; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.ConnectorSplit; +import io.trino.spi.connector.ConnectorSplitSource; +import org.apache.hadoop.conf.Configuration; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; +import 
org.apache.hudi.common.table.HoodieTableMetaClient; + +import java.util.List; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.atomic.AtomicReference; + +import static com.google.common.util.concurrent.MoreExecutors.directExecutor; +import static io.airlift.concurrent.MoreFutures.toCompletableFuture; +import static io.trino.plugin.hudi.HudiSessionProperties.getMinimumAssignedSplitWeight; +import static io.trino.plugin.hudi.HudiSessionProperties.getStandardSplitWeightSize; +import static io.trino.plugin.hudi.HudiSessionProperties.isHudiMetadataEnabled; +import static io.trino.plugin.hudi.HudiSessionProperties.isSizeBasedSplitWeightsEnabled; +import static io.trino.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR; +import static java.util.stream.Collectors.toList; + +public class HudiSplitSource + implements ConnectorSplitSource +{ + private final AsyncQueue queue; + private final AtomicReference trinoException = new AtomicReference<>(); + + public HudiSplitSource( + ConnectorSession session, + HiveMetastore metastore, + Table table, + HudiTableHandle tableHandle, + Configuration configuration, + Map partitionColumnHandleMap, + ExecutorService executor, + int maxSplitsPerSecond, + int maxOutstandingSplits) + { + boolean metadataEnabled = isHudiMetadataEnabled(session); + HoodieTableMetaClient metaClient = buildTableMetaClient(configuration, tableHandle.getBasePath()); + HoodieEngineContext engineContext = new HoodieLocalEngineContext(configuration); + HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder() + .enable(metadataEnabled) + .build(); + List partitionColumnHandles = table.getPartitionColumns().stream() + .map(column -> partitionColumnHandleMap.get(column.getName())).collect(toList()); + + HudiDirectoryLister hudiDirectoryLister = new HudiReadOptimizedDirectoryLister( + metadataConfig, + engineContext, + tableHandle, + metaClient, + metastore, + 
table, + partitionColumnHandles); + + this.queue = new ThrottledAsyncQueue<>(maxSplitsPerSecond, maxOutstandingSplits, executor); + HudiBackgroundSplitLoader splitLoader = new HudiBackgroundSplitLoader( + session, + tableHandle, + hudiDirectoryLister, + queue, + executor, + createSplitWeightProvider(session), + throwable -> { + trinoException.compareAndSet(null, new TrinoException(GENERIC_INTERNAL_ERROR, + "Failed to generate splits for " + table.getTableName(), throwable)); + queue.finish(); + }); + splitLoader.start(); + } + + @Override + public CompletableFuture getNextBatch(int maxSize) + { + boolean noMoreSplits = isFinished(); + Throwable throwable = trinoException.get(); + if (throwable != null) { + return CompletableFuture.failedFuture(throwable); + } + + return toCompletableFuture(Futures.transform( + queue.getBatchAsync(maxSize), + splits -> new ConnectorSplitBatch(splits, noMoreSplits), + directExecutor())); + } + + @Override + public void close() + { + queue.finish(); + } + + @Override + public boolean isFinished() + { + return queue.isFinished(); + } + + private static HoodieTableMetaClient buildTableMetaClient(Configuration configuration, String basePath) + { + HoodieTableMetaClient client = HoodieTableMetaClient.builder().setConf(configuration).setBasePath(basePath).build(); + client.getTableConfig().setValue("hoodie.bootstrap.index.enable", "false"); + return client; + } + + private static HudiSplitWeightProvider createSplitWeightProvider(ConnectorSession session) + { + if (isSizeBasedSplitWeightsEnabled(session)) { + DataSize standardSplitWeightSize = getStandardSplitWeightSize(session); + double minimumAssignedSplitWeight = getMinimumAssignedSplitWeight(session); + return new SizeBasedSplitWeightProvider(minimumAssignedSplitWeight, standardSplitWeightSize); + } + return HudiSplitWeightProvider.uniformStandardWeightProvider(); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTableHandle.java 
b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTableHandle.java new file mode 100644 index 000000000000..7b092288fc68 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTableHandle.java @@ -0,0 +1,113 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.spi.connector.ConnectorTableHandle; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.predicate.TupleDomain; +import org.apache.hudi.common.model.HoodieTableType; + +import static io.trino.spi.connector.SchemaTableName.schemaTableName; +import static java.util.Objects.requireNonNull; + +public class HudiTableHandle + implements ConnectorTableHandle +{ + private final String schemaName; + private final String tableName; + private final String basePath; + private final HoodieTableType tableType; + private final TupleDomain partitionPredicates; + private final TupleDomain regularPredicates; + + @JsonCreator + public HudiTableHandle( + @JsonProperty("schemaName") String schemaName, + @JsonProperty("tableName") String tableName, + @JsonProperty("basePath") String basePath, + @JsonProperty("tableType") HoodieTableType tableType, + @JsonProperty("partitionPredicates") TupleDomain partitionPredicates, + @JsonProperty("regularPredicates") TupleDomain 
regularPredicates) + { + this.schemaName = requireNonNull(schemaName, "schemaName is null"); + this.tableName = requireNonNull(tableName, "tableName is null"); + this.basePath = requireNonNull(basePath, "basePath is null"); + this.tableType = requireNonNull(tableType, "tableType is null"); + this.partitionPredicates = requireNonNull(partitionPredicates, "partitionPredicates is null"); + this.regularPredicates = requireNonNull(regularPredicates, "regularPredicates is null"); + } + + @JsonProperty + public String getSchemaName() + { + return schemaName; + } + + @JsonProperty + public String getTableName() + { + return tableName; + } + + @JsonProperty + public String getBasePath() + { + return basePath; + } + + @JsonProperty + public HoodieTableType getTableType() + { + return tableType; + } + + @JsonProperty + public TupleDomain getPartitionPredicates() + { + return partitionPredicates; + } + + @JsonProperty + public TupleDomain getRegularPredicates() + { + return regularPredicates; + } + + public SchemaTableName getSchemaTableName() + { + return schemaTableName(schemaName, tableName); + } + + HudiTableHandle applyPredicates( + TupleDomain partitionTupleDomain, + TupleDomain regularTupleDomain) + { + return new HudiTableHandle( + schemaName, + tableName, + basePath, + tableType, + partitionPredicates.intersect(partitionTupleDomain), + regularPredicates.intersect(regularTupleDomain)); + } + + @Override + public String toString() + { + return getSchemaTableName().toString(); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTableInfo.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTableInfo.java new file mode 100644 index 000000000000..cdffb35b59a5 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTableInfo.java @@ -0,0 +1,65 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import io.trino.spi.connector.SchemaTableName; + +import static java.util.Objects.requireNonNull; + +public class HudiTableInfo +{ + private final SchemaTableName table; + private final String tableType; + private final String basePath; + + @JsonCreator + public HudiTableInfo( + @JsonProperty("table") SchemaTableName table, + @JsonProperty("tableType") String tableType, + @JsonProperty("basePath") String basePath) + { + this.table = requireNonNull(table, "table is null"); + this.tableType = requireNonNull(tableType, "tableType is null"); + this.basePath = requireNonNull(basePath, "basePath is null"); + } + + @JsonProperty + public SchemaTableName getTable() + { + return table; + } + + @JsonProperty + public String getTableType() + { + return tableType; + } + + @JsonProperty + public String getBasePath() + { + return basePath; + } + + public static HudiTableInfo from(HudiTableHandle tableHandle) + { + requireNonNull(tableHandle, "tableHandle is null"); + return new HudiTableInfo( + tableHandle.getSchemaTableName(), + tableHandle.getTableType().name(), + tableHandle.getBasePath()); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTableProperties.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTableProperties.java new file mode 100644 index 000000000000..2e9323d90e8c --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTableProperties.java 
@@ -0,0 +1,71 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableList; +import io.trino.spi.session.PropertyMetadata; +import io.trino.spi.type.ArrayType; + +import javax.inject.Inject; + +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; + +import static io.trino.spi.session.PropertyMetadata.stringProperty; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.util.Locale.ENGLISH; + +public class HudiTableProperties +{ + public static final String LOCATION_PROPERTY = "location"; + public static final String PARTITIONED_BY_PROPERTY = "partitioned_by"; + + private final List> tableProperties; + + @Inject + public HudiTableProperties() + { + tableProperties = ImmutableList.>builder() + .add(stringProperty( + LOCATION_PROPERTY, + "File system location URI for the table", + null, + false)) + .add(new PropertyMetadata<>( + PARTITIONED_BY_PROPERTY, + "Partition columns", + new ArrayType(VARCHAR), + List.class, + ImmutableList.of(), + false, + value -> ImmutableList.copyOf(((Collection) value).stream() + .map(name -> name.toLowerCase(ENGLISH)) + .collect(Collectors.toList())), + value -> value)) + .build(); + } + + public List> getTableProperties() + { + return tableProperties; + } + + public static Optional getTableLocation(Map tableProperties) + { + return 
Optional.ofNullable((String) tableProperties.get(LOCATION_PROPERTY)); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTransactionManager.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTransactionManager.java new file mode 100644 index 000000000000..6bd2b9257e1e --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiTransactionManager.java @@ -0,0 +1,81 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import io.trino.spi.classloader.ThreadContextClassLoader; +import io.trino.spi.connector.ConnectorTransactionHandle; +import io.trino.spi.security.ConnectorIdentity; + +import javax.annotation.concurrent.GuardedBy; +import javax.inject.Inject; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static java.util.Objects.requireNonNull; + +public class HudiTransactionManager +{ + private final Map transactions = new ConcurrentHashMap<>(); + private final HudiMetadataFactory metadataFactory; + + @Inject + public HudiTransactionManager(HudiMetadataFactory metadataFactory) + { + this.metadataFactory = requireNonNull(metadataFactory, "metadataFactory is null"); + } + + public HudiMetadata get(ConnectorTransactionHandle transaction, ConnectorIdentity identity) + { + HudiMetadata metadata = 
transactions.get(transaction).get(identity); + checkArgument(metadata != null, "no such transaction: %s", transaction); + return metadata; + } + + public void commit(ConnectorTransactionHandle transaction) + { + MemoizedMetadata metadata = transactions.remove(transaction); + checkArgument(metadata != null, "no such transaction: %s", transaction); + } + + public void rollback(ConnectorTransactionHandle transaction) + { + MemoizedMetadata transactionalMetadata = transactions.remove(transaction); + checkArgument(transactionalMetadata != null, "no such transaction: %s", transaction); + } + + public void put(ConnectorTransactionHandle transaction) + { + MemoizedMetadata existing = transactions.putIfAbsent(transaction, new MemoizedMetadata()); + checkState(existing == null, "transaction already exists: %s", existing); + } + + private class MemoizedMetadata + { + @GuardedBy("this") + private HudiMetadata metadata; + + public synchronized HudiMetadata get(ConnectorIdentity identity) + { + if (metadata == null) { + try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(getClass().getClassLoader())) { + metadata = metadataFactory.create(identity); + } + } + return metadata; + } + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiUtil.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiUtil.java new file mode 100644 index 000000000000..51d61aa132e8 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiUtil.java @@ -0,0 +1,164 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.plugin.hudi;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import io.trino.plugin.hive.HiveColumnHandle;
import io.trino.plugin.hive.HivePartition;
import io.trino.plugin.hive.HivePartitionKey;
import io.trino.plugin.hive.HivePartitionManager;
import io.trino.plugin.hive.metastore.Column;
import io.trino.spi.TrinoException;
import io.trino.spi.connector.ColumnHandle;
import io.trino.spi.connector.SchemaTableName;
import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.NullableValue;
import io.trino.spi.predicate.TupleDomain;
import io.trino.spi.type.Type;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.hadoop.HoodieParquetInputFormat;
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;

import java.io.IOException;
import java.util.List;
import java.util.Map;

import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA;
import static io.trino.plugin.hive.util.HiveUtil.checkCondition;
import static io.trino.plugin.hive.util.HiveUtil.parsePartitionValue;
import static io.trino.plugin.hudi.HudiErrorCode.HUDI_CANNOT_OPEN_SPLIT;
import static io.trino.plugin.hudi.HudiErrorCode.HUDI_UNSUPPORTED_FILE_FORMAT;
import static java.util.stream.Collectors.toList;

/**
 * Static helpers for the Hudi connector: file-format detection, partition name/value
 * parsing, and partition pruning against a {@link TupleDomain} predicate.
 */
public final class HudiUtil
{
    private HudiUtil() {}

    /** Returns true when the given input format is Hudi's Parquet input format. */
    public static boolean isHudiParquetInputFormat(InputFormat<?, ?> inputFormat)
    {
        return inputFormat instanceof HoodieParquetInputFormat;
    }

    /**
     * Maps a file path to its {@link HoodieFileFormat} based on the file extension.
     *
     * @throws TrinoException with {@code HUDI_UNSUPPORTED_FILE_FORMAT} for unknown extensions
     */
    public static HoodieFileFormat getHudiFileFormat(String path)
    {
        final String extension = FSUtils.getFileExtension(path);
        if (extension.equals(HoodieFileFormat.PARQUET.getFileExtension())) {
            return HoodieFileFormat.PARQUET;
        }
        if (extension.equals(HoodieFileFormat.HOODIE_LOG.getFileExtension())) {
            return HoodieFileFormat.HOODIE_LOG;
        }
        if (extension.equals(HoodieFileFormat.ORC.getFileExtension())) {
            return HoodieFileFormat.ORC;
        }
        if (extension.equals(HoodieFileFormat.HFILE.getFileExtension())) {
            return HoodieFileFormat.HFILE;
        }
        throw new TrinoException(HUDI_UNSUPPORTED_FILE_FORMAT, "Hoodie InputFormat not implemented for base file of type " + extension);
    }

    /**
     * Returns whether the partition identified by a Hive partition name (key=value path form)
     * satisfies the given predicate summary.
     */
    public static boolean partitionMatchesPredicates(
            SchemaTableName tableName,
            String hivePartitionName,
            List<HiveColumnHandle> partitionColumnHandles,
            TupleDomain<ColumnHandle> constraintSummary)
    {
        List<Type> partitionColumnTypes = partitionColumnHandles.stream()
                .map(HiveColumnHandle::getType)
                .collect(toList());
        HivePartition partition = HivePartitionManager.parsePartition(
                tableName, hivePartitionName, partitionColumnHandles, partitionColumnTypes);

        return partitionMatches(partitionColumnHandles, constraintSummary, partition);
    }

    /**
     * Returns whether the partition identified by a relative path plus explicit
     * partition values satisfies the given predicate summary.
     */
    public static boolean partitionMatchesPredicates(
            SchemaTableName tableName,
            String relativePartitionPath,
            List<String> partitionValues,
            List<HiveColumnHandle> partitionColumnHandles,
            TupleDomain<ColumnHandle> constraintSummary)
    {
        List<Type> partitionColumnTypes = partitionColumnHandles.stream()
                .map(HiveColumnHandle::getType)
                .collect(toList());
        HivePartition partition = parsePartition(
                tableName, relativePartitionPath, partitionValues, partitionColumnHandles, partitionColumnTypes);

        return partitionMatches(partitionColumnHandles, constraintSummary, partition);
    }

    // Builds a HivePartition by parsing each raw partition value with its column type.
    // Assumes partitionValues is index-aligned with partitionColumns.
    private static HivePartition parsePartition(
            SchemaTableName tableName,
            String partitionName,
            List<String> partitionValues,
            List<HiveColumnHandle> partitionColumns,
            List<Type> partitionColumnTypes)
    {
        ImmutableMap.Builder<ColumnHandle, NullableValue> builder = ImmutableMap.builder();
        for (int i = 0; i < partitionColumns.size(); i++) {
            HiveColumnHandle column = partitionColumns.get(i);
            NullableValue parsedValue = parsePartitionValue(
                    partitionName, partitionValues.get(i), partitionColumnTypes.get(i));
            builder.put(column, parsedValue);
        }
        Map<ColumnHandle, NullableValue> values = builder.buildOrThrow();
        return new HivePartition(tableName, partitionName, values);
    }

    /**
     * Evaluates the predicate summary against a parsed partition.
     * Returns false when the summary is NONE; columns with no domain are unconstrained.
     */
    public static boolean partitionMatches(List<HiveColumnHandle> partitionColumns, TupleDomain<ColumnHandle> constraintSummary, HivePartition partition)
    {
        if (constraintSummary.isNone()) {
            return false;
        }
        Map<ColumnHandle, Domain> domains = constraintSummary.getDomains().orElseGet(ImmutableMap::of);
        for (HiveColumnHandle column : partitionColumns) {
            // NOTE(review): assumes partition.getKeys() has an entry for every partition
            // column — a missing key would NPE on value.getValue(); confirm the invariant
            NullableValue value = partition.getKeys().get(column);
            Domain allowedDomain = domains.get(column);
            if (allowedDomain != null && !allowedDomain.includesNullableValue(value.getValue())) {
                return false;
            }
        }
        return true;
    }

    /**
     * Zips partition key columns with their values into {@link HivePartitionKey}s.
     *
     * @throws TrinoException with {@code HIVE_INVALID_METADATA} if the sizes differ
     */
    public static List<HivePartitionKey> buildPartitionKeys(List<Column> keys, List<String> values)
    {
        checkCondition(keys.size() == values.size(), HIVE_INVALID_METADATA,
                "Expected %s partition key values, but got %s. Keys: %s, Values: %s.",
                keys.size(), values.size(), keys, values);
        ImmutableList.Builder<HivePartitionKey> partitionKeys = ImmutableList.builder();
        for (int i = 0; i < keys.size(); i++) {
            String name = keys.get(i).getName();
            String value = values.get(i);
            partitionKeys.add(new HivePartitionKey(name, value));
        }
        return partitionKeys.build();
    }

    /**
     * Fetches the {@link FileStatus} for a Hudi base file, wrapping I/O failures
     * in a TrinoException with {@code HUDI_CANNOT_OPEN_SPLIT}.
     */
    public static FileStatus getFileStatus(HoodieBaseFile baseFile)
    {
        try {
            return HoodieInputFormatUtils.getFileStatus(baseFile);
        }
        catch (IOException e) {
            throw new TrinoException(HUDI_CANNOT_OPEN_SPLIT, "Error getting file status of " + baseFile.getPath(), e);
        }
    }
}
diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/InternalHudiConnectorFactory.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/InternalHudiConnectorFactory.java
new file mode 100644
index 000000000000..304e4b54b2c3
--- /dev/null
+++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/InternalHudiConnectorFactory.java
@@ -0,0 +1,107 @@
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.plugin.hudi;

import com.google.common.collect.ImmutableSet;
import com.google.inject.Injector;
import com.google.inject.Key;
import com.google.inject.TypeLiteral;
import io.airlift.bootstrap.Bootstrap;
import io.airlift.bootstrap.LifeCycleManager;
import io.airlift.event.client.EventModule;
import io.airlift.json.JsonModule;
import io.trino.filesystem.hdfs.HdfsFileSystemModule;
import io.trino.hdfs.HdfsModule;
import io.trino.hdfs.authentication.HdfsAuthenticationModule;
import io.trino.plugin.base.CatalogName;
import io.trino.plugin.base.classloader.ClassLoaderSafeConnectorPageSourceProvider;
import io.trino.plugin.base.classloader.ClassLoaderSafeConnectorSplitManager;
import io.trino.plugin.base.classloader.ClassLoaderSafeNodePartitioningProvider;
import io.trino.plugin.base.jmx.MBeanServerModule;
import io.trino.plugin.base.session.SessionPropertiesProvider;
import io.trino.plugin.hive.azure.HiveAzureModule;
import io.trino.plugin.hive.gcs.HiveGcsModule;
import io.trino.plugin.hive.metastore.HiveMetastore;
import io.trino.plugin.hive.metastore.HiveMetastoreModule;
import io.trino.plugin.hive.s3.HiveS3Module;
import io.trino.spi.NodeManager;
import io.trino.spi.classloader.ThreadContextClassLoader;
import io.trino.spi.connector.Connector;
import io.trino.spi.connector.ConnectorContext;
import io.trino.spi.connector.ConnectorNodePartitioningProvider;
import io.trino.spi.connector.ConnectorPageSourceProvider;
import io.trino.spi.connector.ConnectorSplitManager;
import io.trino.spi.type.TypeManager;
import org.weakref.jmx.guice.MBeanModule;

import java.util.Map;
import java.util.Optional;
import java.util.Set;

/**
 * Builds the Hudi {@link Connector} by wiring the Guice modules (metastore, HDFS,
 * cloud storage, JMX) and wrapping split/page-source/partitioning providers in
 * classloader-safe adapters.
 */
public class InternalHudiConnectorFactory
{
    private InternalHudiConnectorFactory() {}

    /**
     * @param catalogName catalog this connector instance serves
     * @param config raw catalog configuration properties
     * @param context services provided by the engine (node manager, type manager, ...)
     * @param metastore optional pre-built metastore; when empty, HiveMetastoreModule creates one from config
     */
    public static Connector createConnector(
            String catalogName,
            Map<String, String> config,
            ConnectorContext context,
            Optional<HiveMetastore> metastore)
    {
        ClassLoader classLoader = InternalHudiConnectorFactory.class.getClassLoader();
        // All wiring runs with the plugin classloader as the thread context classloader
        try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(classLoader)) {
            Bootstrap app = new Bootstrap(
                    new EventModule(),
                    new MBeanModule(),
                    new JsonModule(),
                    new HudiModule(),
                    new HiveMetastoreModule(metastore),
                    new HdfsModule(),
                    new HiveS3Module(),
                    new HiveGcsModule(),
                    new HiveAzureModule(),
                    new HdfsAuthenticationModule(),
                    new HdfsFileSystemModule(),
                    new MBeanServerModule(),
                    binder -> {
                        binder.bind(NodeManager.class).toInstance(context.getNodeManager());
                        binder.bind(TypeManager.class).toInstance(context.getTypeManager());
                        binder.bind(CatalogName.class).toInstance(new CatalogName(catalogName));
                    });

            Injector injector = app
                    .doNotInitializeLogging()
                    .setRequiredConfigurationProperties(config)
                    .initialize();

            LifeCycleManager lifeCycleManager = injector.getInstance(LifeCycleManager.class);
            HudiTransactionManager transactionManager = injector.getInstance(HudiTransactionManager.class);
            ConnectorSplitManager splitManager = injector.getInstance(ConnectorSplitManager.class);
            ConnectorPageSourceProvider connectorPageSource = injector.getInstance(ConnectorPageSourceProvider.class);
            ConnectorNodePartitioningProvider connectorDistributionProvider = injector.getInstance(ConnectorNodePartitioningProvider.class);
            Set<SessionPropertiesProvider> sessionPropertiesProviders = injector.getInstance(Key.get(new TypeLiteral<Set<SessionPropertiesProvider>>() {}));
            HudiTableProperties hudiTableProperties = injector.getInstance(HudiTableProperties.class);

            return new HudiConnector(
                    lifeCycleManager,
                    transactionManager,
                    new ClassLoaderSafeConnectorSplitManager(splitManager, classLoader),
                    new ClassLoaderSafeConnectorPageSourceProvider(connectorPageSource, classLoader),
                    new ClassLoaderSafeNodePartitioningProvider(connectorDistributionProvider, classLoader),
                    ImmutableSet.of(),
                    sessionPropertiesProviders,
                    hudiTableProperties.getTableProperties());
        }
    }
}
diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HiveHudiPartitionInfo.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HiveHudiPartitionInfo.java
new file mode 100644
index 000000000000..cf3596092da6
--- /dev/null
+++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HiveHudiPartitionInfo.java
@@ -0,0 +1,131 @@
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ */ +package io.trino.plugin.hudi.partition; + +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.HivePartitionKey; +import io.trino.plugin.hive.metastore.Column; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hive.metastore.Partition; +import io.trino.plugin.hive.metastore.Table; +import io.trino.plugin.hive.util.HiveUtil; +import io.trino.spi.predicate.TupleDomain; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.exception.HoodieIOException; + +import java.util.Collections; +import java.util.List; +import java.util.Optional; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static io.trino.plugin.hudi.HudiUtil.buildPartitionKeys; +import static io.trino.plugin.hudi.HudiUtil.partitionMatchesPredicates; +import static java.lang.String.format; + +public class HiveHudiPartitionInfo + implements HudiPartitionInfo +{ + private final Table table; + private final List partitionColumnHandles; + private final TupleDomain constraintSummary; + private final String hivePartitionName; + private final List partitionColumns; + private final HiveMetastore hiveMetastore; + private String relativePartitionPath; + private List hivePartitionKeys; + + public HiveHudiPartitionInfo( + String hivePartitionName, + List partitionColumns, + List partitionColumnHandles, + TupleDomain constraintSummary, + Table table, + HiveMetastore hiveMetastore) + { + this.table = table; + this.partitionColumnHandles = partitionColumnHandles; + this.constraintSummary = constraintSummary; + this.hivePartitionName = hivePartitionName; + this.partitionColumns = partitionColumns; + if (partitionColumns.isEmpty()) { + this.relativePartitionPath = ""; + this.hivePartitionKeys = Collections.emptyList(); + } + this.hiveMetastore = hiveMetastore; + } + + @Override + public Table getTable() + { + return null; + } + + @Override + public String getRelativePartitionPath() + { + if 
(relativePartitionPath == null) { + loadPartitionInfo(hiveMetastore.getPartition(table, HiveUtil.toPartitionValues(hivePartitionName))); + } + return relativePartitionPath; + } + + @Override + public String getHivePartitionName() + { + return hivePartitionName; + } + + @Override + public List getHivePartitionKeys() + { + if (hivePartitionKeys == null) { + loadPartitionInfo(hiveMetastore.getPartition(table, HiveUtil.toPartitionValues(hivePartitionName))); + } + return hivePartitionKeys; + } + + @Override + public boolean doesMatchPredicates() + { + return partitionMatchesPredicates(table.getSchemaTableName(), hivePartitionName, partitionColumnHandles, constraintSummary); + } + + @Override + public String getComparingKey() + { + return hivePartitionName; + } + + @Override + public void loadPartitionInfo(Optional partition) + { + if (partition.isEmpty()) { + throw new HoodieIOException(format("Cannot find partition in Hive Metastore: %s", hivePartitionName)); + } + this.relativePartitionPath = FSUtils.getRelativePartitionPath( + new Path(table.getStorage().getLocation()), + new Path(partition.get().getStorage().getLocation())); + this.hivePartitionKeys = buildPartitionKeys(partitionColumns, partition.get().getValues()); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("hivePartitionName", hivePartitionName) + .add("hivePartitionKeys", hivePartitionKeys) + .toString(); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionInfo.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionInfo.java new file mode 100644 index 000000000000..df71e9642735 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionInfo.java @@ -0,0 +1,38 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.plugin.hudi.partition;

import io.trino.plugin.hive.HivePartitionKey;
import io.trino.plugin.hive.metastore.Partition;
import io.trino.plugin.hive.metastore.Table;

import java.util.List;
import java.util.Optional;

/**
 * A single Hudi table partition as seen during split generation.
 */
public interface HudiPartitionInfo
{
    /** Returns the Hive table this partition belongs to. */
    Table getTable();

    /** Returns the partition path relative to the table base path ("" for unpartitioned tables). */
    String getRelativePartitionPath();

    /** Returns the Hive partition name (key=value path form). */
    String getHivePartitionName();

    /** Returns the partition key/value pairs. */
    List<HivePartitionKey> getHivePartitionKeys();

    /** Returns whether this partition can match the query's partition predicates. */
    boolean doesMatchPredicates();

    /** Returns a key used to order partitions deterministically. */
    String getComparingKey();

    /** Populates lazily-loaded state from a metastore partition. */
    void loadPartitionInfo(Optional<Partition> partition);
}
diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionInfoLoader.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionInfoLoader.java
new file mode 100644
index 000000000000..bf501a777d3d
--- /dev/null
+++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/partition/HudiPartitionInfoLoader.java
@@ -0,0 +1,122 @@
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ */ +package io.trino.plugin.hudi.partition; + +import io.trino.plugin.hive.metastore.Partition; +import io.trino.plugin.hudi.query.HudiDirectoryLister; +import io.trino.spi.connector.ConnectorSession; +import org.apache.hudi.exception.HoodieIOException; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Deque; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.ConcurrentLinkedDeque; +import java.util.stream.Collectors; + +import static io.trino.plugin.hudi.HudiSessionProperties.getMaxPartitionBatchSize; +import static io.trino.plugin.hudi.HudiSessionProperties.getMinPartitionBatchSize; + +public class HudiPartitionInfoLoader + implements Runnable +{ + private final HudiDirectoryLister hudiDirectoryLister; + private final int minPartitionBatchSize; + private final int maxPartitionBatchSize; + private final Deque partitionQueue; + private int currentBatchSize; + + public HudiPartitionInfoLoader( + ConnectorSession session, + HudiDirectoryLister hudiDirectoryLister) + { + this.hudiDirectoryLister = hudiDirectoryLister; + this.partitionQueue = new ConcurrentLinkedDeque<>(); + this.minPartitionBatchSize = getMinPartitionBatchSize(session); + this.maxPartitionBatchSize = getMaxPartitionBatchSize(session); + this.currentBatchSize = -1; + } + + @Override + public void run() + { + List hudiPartitionInfoList = hudiDirectoryLister.getPartitionsToScan().stream() + .sorted(Comparator.comparing(HudiPartitionInfo::getComparingKey)) + .collect(Collectors.toList()); + + // empty partitioned table + if (hudiPartitionInfoList.isEmpty()) { + return; + } + + // non-partitioned table + if (hudiPartitionInfoList.size() == 1 && hudiPartitionInfoList.get(0).getHivePartitionName().isEmpty()) { + partitionQueue.addAll(hudiPartitionInfoList); + return; + } + + boolean shouldUseHiveMetastore = hudiPartitionInfoList.get(0) instanceof HiveHudiPartitionInfo; + Iterator iterator = 
hudiPartitionInfoList.iterator(); + while (iterator.hasNext()) { + int batchSize = updateBatchSize(); + List partitionInfoBatch = new ArrayList<>(); + while (iterator.hasNext() && batchSize > 0) { + partitionInfoBatch.add(iterator.next()); + batchSize--; + } + + if (!partitionInfoBatch.isEmpty()) { + if (shouldUseHiveMetastore) { + Map> partitions = hudiDirectoryLister.getPartitions(partitionInfoBatch.stream() + .map(HudiPartitionInfo::getHivePartitionName) + .collect(Collectors.toList())); + for (HudiPartitionInfo partitionInfo : partitionInfoBatch) { + String hivePartitionName = partitionInfo.getHivePartitionName(); + if (!partitions.containsKey(hivePartitionName)) { + throw new HoodieIOException("Partition does not exist: " + hivePartitionName); + } + partitionInfo.loadPartitionInfo(partitions.get(hivePartitionName)); + partitionQueue.add(partitionInfo); + } + } + else { + for (HudiPartitionInfo partitionInfo : partitionInfoBatch) { + partitionInfo.getHivePartitionKeys(); + partitionQueue.add(partitionInfo); + } + } + } + } + } + + public Deque getPartitionQueue() + { + return partitionQueue; + } + + private int updateBatchSize() + { + if (currentBatchSize <= 0) { + currentBatchSize = minPartitionBatchSize; + } + else { + currentBatchSize *= 2; + currentBatchSize = Math.min(currentBatchSize, maxPartitionBatchSize); + } + return currentBatchSize; + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/query/HudiDirectoryLister.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/query/HudiDirectoryLister.java new file mode 100644 index 000000000000..401e0f35e844 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/query/HudiDirectoryLister.java @@ -0,0 +1,33 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.plugin.hudi.query;

import io.trino.plugin.hive.metastore.Partition;
import io.trino.plugin.hudi.partition.HudiPartitionInfo;
import org.apache.hadoop.fs.FileStatus;

import java.io.Closeable;
import java.util.List;
import java.util.Map;
import java.util.Optional;

/**
 * Lists the partitions and data files of a Hudi table for split generation.
 * Implementations may hold resources (e.g. a file-system view) and must be closed.
 */
public interface HudiDirectoryLister
        extends Closeable
{
    /** Returns the partitions that should be scanned for the current query. */
    List<HudiPartitionInfo> getPartitionsToScan();

    /** Returns the file statuses of the latest base files in the given partition. */
    List<FileStatus> listStatus(HudiPartitionInfo partitionInfo);

    /** Bulk-resolves metastore partitions by Hive partition name. */
    Map<String, Optional<Partition>> getPartitions(List<String> partitionNames);
}
diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/query/HudiReadOptimizedDirectoryLister.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/query/HudiReadOptimizedDirectoryLister.java
new file mode 100644
index 000000000000..92aad499ce63
--- /dev/null
+++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/query/HudiReadOptimizedDirectoryLister.java
@@ -0,0 +1,131 @@
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ */ +package io.trino.plugin.hudi.query; + +import io.trino.plugin.hive.HiveColumnHandle; +import io.trino.plugin.hive.metastore.Column; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hive.metastore.MetastoreUtil; +import io.trino.plugin.hive.metastore.Partition; +import io.trino.plugin.hive.metastore.Table; +import io.trino.plugin.hudi.HudiTableHandle; +import io.trino.plugin.hudi.partition.HiveHudiPartitionInfo; +import io.trino.plugin.hudi.partition.HudiPartitionInfo; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.connector.TableNotFoundException; +import io.trino.spi.predicate.TupleDomain; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.view.FileSystemViewManager; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.trino.plugin.hudi.HudiUtil.getFileStatus; + +public class HudiReadOptimizedDirectoryLister + implements HudiDirectoryLister +{ + private final HudiTableHandle tableHandle; + private final HiveMetastore hiveMetastore; + private final Table hiveTable; + private final SchemaTableName tableName; + private final List partitionColumnHandles; + private final HoodieTableFileSystemView fileSystemView; + private final TupleDomain partitionKeysFilter; + private final List partitionColumns; + + private List hivePartitionNames; + + public HudiReadOptimizedDirectoryLister( + HoodieMetadataConfig metadataConfig, + HoodieEngineContext engineContext, + HudiTableHandle tableHandle, + HoodieTableMetaClient metaClient, + HiveMetastore hiveMetastore, + Table 
hiveTable, + List partitionColumnHandles) + { + this.tableHandle = tableHandle; + this.tableName = tableHandle.getSchemaTableName(); + this.hiveMetastore = hiveMetastore; + this.hiveTable = hiveTable; + this.partitionColumnHandles = partitionColumnHandles; + this.fileSystemView = FileSystemViewManager.createInMemoryFileSystemView(engineContext, metaClient, metadataConfig); + this.partitionKeysFilter = MetastoreUtil.computePartitionKeyFilter(partitionColumnHandles, tableHandle.getPartitionPredicates()); + this.partitionColumns = hiveTable.getPartitionColumns(); + } + + @Override + public List getPartitionsToScan() + { + if (hivePartitionNames == null) { + hivePartitionNames = partitionColumns.isEmpty() + ? Collections.singletonList("") + : getPartitionNamesFromHiveMetastore(partitionKeysFilter); + } + + List allPartitionInfoList = hivePartitionNames.stream() + .map(hivePartitionName -> new HiveHudiPartitionInfo( + hivePartitionName, + partitionColumns, + partitionColumnHandles, + tableHandle.getPartitionPredicates(), + hiveTable, + hiveMetastore)) + .collect(Collectors.toList()); + + return allPartitionInfoList.stream() + .filter(partitionInfo -> partitionInfo.getHivePartitionKeys().isEmpty() || partitionInfo.doesMatchPredicates()) + .collect(Collectors.toList()); + } + + @Override + public List listStatus(HudiPartitionInfo partitionInfo) + { + return fileSystemView.getLatestBaseFiles(partitionInfo.getRelativePartitionPath()) + .map(baseFile -> getFileStatus(baseFile)) + .collect(toImmutableList()); + } + + private List getPartitionNamesFromHiveMetastore(TupleDomain partitionKeysFilter) + { + return hiveMetastore.getPartitionNamesByFilter( + tableName.getSchemaName(), + tableName.getTableName(), + partitionColumns.stream().map(Column::getName).collect(Collectors.toList()), + partitionKeysFilter).orElseThrow(() -> new TableNotFoundException(tableHandle.getSchemaTableName())); + } + + @Override + public Map> getPartitions(List partitionNames) + { + return 
hiveMetastore.getPartitionsByNames(hiveTable, partitionNames); + } + + @Override + public void close() + { + if (fileSystemView != null && !fileSystemView.isClosed()) { + fileSystemView.close(); + } + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/split/HudiBackgroundSplitLoader.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/split/HudiBackgroundSplitLoader.java new file mode 100644 index 000000000000..b9ca3cbe60d2 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/split/HudiBackgroundSplitLoader.java @@ -0,0 +1,115 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi.split; + +import com.google.common.util.concurrent.FutureCallback; +import com.google.common.util.concurrent.Futures; +import com.google.common.util.concurrent.ListenableFuture; +import io.airlift.concurrent.MoreFutures; +import io.trino.plugin.hive.HivePartitionKey; +import io.trino.plugin.hive.util.AsyncQueue; +import io.trino.plugin.hudi.HudiTableHandle; +import io.trino.plugin.hudi.partition.HudiPartitionInfo; +import io.trino.plugin.hudi.partition.HudiPartitionInfoLoader; +import io.trino.plugin.hudi.query.HudiDirectoryLister; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.ConnectorSplit; +import org.apache.hadoop.fs.FileStatus; + +import java.util.Collection; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.function.Consumer; +import java.util.stream.Collectors; + +import static com.google.common.util.concurrent.MoreExecutors.directExecutor; +import static java.util.Objects.requireNonNull; + +public class HudiBackgroundSplitLoader +{ + private final ConnectorSession session; + private final HudiDirectoryLister hudiDirectoryLister; + private final AsyncQueue asyncQueue; + private final ExecutorService executor; + private final Consumer errorListener; + private final HudiSplitFactory hudiSplitFactory; + + public HudiBackgroundSplitLoader( + ConnectorSession session, + HudiTableHandle tableHandle, + HudiDirectoryLister hudiDirectoryLister, + AsyncQueue asyncQueue, + ExecutorService executor, + HudiSplitWeightProvider hudiSplitWeightProvider, + Consumer errorListener) + { + this.session = requireNonNull(session, "session is null"); + this.hudiDirectoryLister = requireNonNull(hudiDirectoryLister, "hudiDirectoryLister is null"); + this.asyncQueue = requireNonNull(asyncQueue, "asyncQueue is null"); + this.executor = requireNonNull(executor, "executor is null"); + this.errorListener = requireNonNull(errorListener, "errorListener is null"); + 
this.hudiSplitFactory = new HudiSplitFactory(tableHandle, hudiSplitWeightProvider); + } + + public void start() + { + ListenableFuture> partitionsFuture = Futures.submit(this::loadPartitions, executor); + hookErrorListener(partitionsFuture); + + ListenableFuture splitFutures = Futures.transform( + partitionsFuture, + partitions -> { + List> futures = partitions.stream() + .map(partition -> Futures.submit(() -> loadSplits(partition), executor)) + .peek(this::hookErrorListener) + .collect(Collectors.toList()); + Futures.whenAllComplete(futures).run(asyncQueue::finish, directExecutor()); + return null; + }, + directExecutor()); + hookErrorListener(splitFutures); + } + + private Collection loadPartitions() + { + HudiPartitionInfoLoader partitionInfoLoader = new HudiPartitionInfoLoader(session, hudiDirectoryLister); + partitionInfoLoader.run(); + return partitionInfoLoader.getPartitionQueue(); + } + + private void loadSplits(HudiPartitionInfo partition) + { + List partitionKeys = partition.getHivePartitionKeys(); + List partitionFiles = hudiDirectoryLister.listStatus(partition); + partitionFiles.stream() + .flatMap(fileStatus -> hudiSplitFactory.createSplits(partitionKeys, fileStatus)) + .map(asyncQueue::offer) + .forEachOrdered(MoreFutures::getFutureValue); + } + + private void hookErrorListener(ListenableFuture future) + { + Futures.addCallback(future, new FutureCallback() + { + @Override + public void onSuccess(T result) {} + + @Override + public void onFailure(Throwable t) + { + errorListener.accept(t); + } + }, directExecutor()); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/split/HudiSplitFactory.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/split/HudiSplitFactory.java new file mode 100644 index 000000000000..8e874fbcfad2 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/split/HudiSplitFactory.java @@ -0,0 +1,106 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may 
not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi.split; + +import com.google.common.collect.ImmutableList; +import io.trino.plugin.hive.HivePartitionKey; +import io.trino.plugin.hudi.HudiSplit; +import io.trino.plugin.hudi.HudiTableHandle; +import io.trino.spi.TrinoException; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hudi.hadoop.PathWithBootstrapFileStatus; + +import java.io.IOException; +import java.util.List; +import java.util.stream.Stream; + +import static io.trino.plugin.hudi.HudiErrorCode.HUDI_CANNOT_OPEN_SPLIT; +import static java.util.Objects.requireNonNull; + +public class HudiSplitFactory +{ + private static final double SPLIT_SLOP = 1.1; // 10% slop/overflow allowed in bytes per split while generating splits + + private final HudiTableHandle hudiTableHandle; + private final HudiSplitWeightProvider hudiSplitWeightProvider; + + public HudiSplitFactory( + HudiTableHandle hudiTableHandle, + HudiSplitWeightProvider hudiSplitWeightProvider) + { + this.hudiTableHandle = requireNonNull(hudiTableHandle, "hudiTableHandle is null"); + this.hudiSplitWeightProvider = requireNonNull(hudiSplitWeightProvider, "hudiSplitWeightProvider is null"); + } + + public Stream createSplits(List partitionKeys, FileStatus fileStatus) + { + List splits; + try { + splits = createSplits(fileStatus); + } + catch (IOException e) { + throw new TrinoException(HUDI_CANNOT_OPEN_SPLIT, e); + } + + return splits.stream() + .map(fileSplit 
-> new HudiSplit( + fileSplit.getPath().toString(), + fileSplit.getStart(), + fileSplit.getLength(), + fileStatus.getLen(), + ImmutableList.of(), + hudiTableHandle.getRegularPredicates(), + partitionKeys, + hudiSplitWeightProvider.calculateSplitWeight(fileSplit.getLength()))); + } + + private List createSplits(FileStatus fileStatus) + throws IOException + { + if (fileStatus.isDirectory()) { + throw new IOException("Not a file: " + fileStatus.getPath()); + } + + Path path = fileStatus.getPath(); + long length = fileStatus.getLen(); + + if (length == 0) { + return ImmutableList.of(new FileSplit(path, 0, 0, new String[0])); + } + + if (!isSplitable(path)) { + return ImmutableList.of(new FileSplit(path, 0, length, (String[]) null)); + } + + ImmutableList.Builder splits = ImmutableList.builder(); + long splitSize = fileStatus.getBlockSize(); + + long bytesRemaining = length; + while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) { + splits.add(new FileSplit(path, length - bytesRemaining, splitSize, (String[]) null)); + bytesRemaining -= splitSize; + } + if (bytesRemaining != 0) { + splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining, (String[]) null)); + } + return splits.build(); + } + + private static boolean isSplitable(Path filename) + { + return !(filename instanceof PathWithBootstrapFileStatus); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/split/HudiSplitWeightProvider.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/split/HudiSplitWeightProvider.java new file mode 100644 index 000000000000..dd90dd356340 --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/split/HudiSplitWeightProvider.java @@ -0,0 +1,26 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi.split; + +import io.trino.spi.SplitWeight; + +public interface HudiSplitWeightProvider +{ + SplitWeight calculateSplitWeight(long splitSizeInBytes); + + static HudiSplitWeightProvider uniformStandardWeightProvider() + { + return splitSizeInBytes -> SplitWeight.standard(); + } +} diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/split/SizeBasedSplitWeightProvider.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/split/SizeBasedSplitWeightProvider.java new file mode 100644 index 000000000000..b7abb9c4d7eb --- /dev/null +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/split/SizeBasedSplitWeightProvider.java @@ -0,0 +1,47 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi.split; + +import io.airlift.units.DataSize; +import io.trino.spi.SplitWeight; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.primitives.Doubles.constrainToRange; +import static java.util.Objects.requireNonNull; + +public class SizeBasedSplitWeightProvider + implements HudiSplitWeightProvider +{ + private final double minimumWeight; + private final double standardSplitSizeInBytes; + + public SizeBasedSplitWeightProvider(double minimumWeight, DataSize standardSplitSize) + { + checkArgument( + Double.isFinite(minimumWeight) && minimumWeight > 0 && minimumWeight <= 1, + "minimumWeight must be > 0 and <= 1, found: %s", minimumWeight); + this.minimumWeight = minimumWeight; + long standardSplitSizeInBytesLong = requireNonNull(standardSplitSize, "standardSplitSize is null").toBytes(); + checkArgument(standardSplitSizeInBytesLong > 0, "standardSplitSize must be > 0, found: %s", standardSplitSize); + this.standardSplitSizeInBytes = (double) standardSplitSizeInBytesLong; + } + + @Override + public SplitWeight calculateSplitWeight(long splitSizeInBytes) + { + double computedWeight = splitSizeInBytes / standardSplitSizeInBytes; + // Clamp the value between the minimum weight and 1.0 (standard weight) + return SplitWeight.fromProportion(constrainToRange(computedWeight, minimumWeight, 1.0)); + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/BaseHudiConnectorTest.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/BaseHudiConnectorTest.java new file mode 100644 index 000000000000..2314d9dd0fb4 --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/BaseHudiConnectorTest.java @@ -0,0 +1,108 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableMap; +import io.trino.plugin.hudi.testing.TpchHudiTablesInitializer; +import io.trino.testing.BaseConnectorTest; +import io.trino.testing.QueryRunner; +import io.trino.testing.TestingConnectorBehavior; +import org.apache.hudi.common.model.HoodieTableType; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; + +import static io.trino.plugin.hudi.HudiQueryRunner.createHudiQueryRunner; +import static org.apache.hudi.common.model.HoodieRecord.HOODIE_META_COLUMNS; +import static org.assertj.core.api.Assertions.assertThat; + +public abstract class BaseHudiConnectorTest + extends BaseConnectorTest +{ + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + return createHudiQueryRunner( + ImmutableMap.of(), + ImmutableMap.of("hudi.columns-to-hide", columnsToHide()), + new TpchHudiTablesInitializer(getHoodieTableType(), REQUIRED_TPCH_TABLES)); + } + + @Override + protected boolean hasBehavior(TestingConnectorBehavior connectorBehavior) + { + switch (connectorBehavior) { + // Optimizer + case SUPPORTS_TOPN_PUSHDOWN: + return false; + + // DDL and DML on schemas and tables + case SUPPORTS_CREATE_SCHEMA: + case SUPPORTS_CREATE_TABLE: + case SUPPORTS_CREATE_TABLE_WITH_DATA: + case SUPPORTS_COMMENT_ON_TABLE: + case SUPPORTS_RENAME_TABLE: + return false; + + // DDL and DML on columns + case SUPPORTS_ADD_COLUMN: + case SUPPORTS_ADD_COLUMN_WITH_COMMENT: + case SUPPORTS_COMMENT_ON_COLUMN: + case SUPPORTS_RENAME_COLUMN: + 
return false; + + // Writing capabilities + case SUPPORTS_DELETE: + case SUPPORTS_INSERT: + case SUPPORTS_MULTI_STATEMENT_WRITES: + return false; + + default: + return super.hasBehavior(connectorBehavior); + } + } + + @Test + @Override + public void testShowCreateTable() + { + assertThat((String) computeActual("SHOW CREATE TABLE orders").getOnlyValue()) + .matches("CREATE TABLE \\w+\\.\\w+\\.orders \\Q(\n" + + " orderkey bigint,\n" + + " custkey bigint,\n" + + " orderstatus varchar(1),\n" + + " totalprice double,\n" + + " orderdate date,\n" + + " orderpriority varchar(15),\n" + + " clerk varchar(15),\n" + + " shippriority integer,\n" + + " comment varchar(79)\n" + + ")\n" + + "WITH (\n" + + " location = \\E'.*/orders'\n\\Q" + + ")"); + } + + protected abstract HoodieTableType getHoodieTableType(); + + static String columnsToHide() + { + List columns = new ArrayList<>(HOODIE_META_COLUMNS.size() + 1); + columns.addAll(HOODIE_META_COLUMNS); + columns.add(TpchHudiTablesInitializer.FIELD_UUID); + return String.join(",", columns); + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/BaseHudiMinioConnectorTest.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/BaseHudiMinioConnectorTest.java new file mode 100644 index 000000000000..a1a5fb138a78 --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/BaseHudiMinioConnectorTest.java @@ -0,0 +1,115 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableMap; +import io.trino.plugin.hive.containers.HiveMinioDataLake; +import io.trino.plugin.hudi.testing.TpchHudiTablesInitializer; +import io.trino.testing.BaseConnectorTest; +import io.trino.testing.QueryRunner; +import io.trino.testing.TestingConnectorBehavior; +import org.apache.hudi.common.model.HoodieTableType; +import org.testng.annotations.Test; + +import static io.trino.plugin.hudi.BaseHudiConnectorTest.columnsToHide; +import static io.trino.plugin.hudi.S3HudiQueryRunner.HUDI_MINIO_TESTS; +import static io.trino.testing.sql.TestTable.randomTableSuffix; +import static java.util.Objects.requireNonNull; +import static org.assertj.core.api.Assertions.assertThat; + +public abstract class BaseHudiMinioConnectorTest + extends BaseConnectorTest +{ + private final HoodieTableType tableType; + protected HiveMinioDataLake hiveMinioDataLake; + + public BaseHudiMinioConnectorTest(HoodieTableType tableType) + { + this.tableType = requireNonNull(tableType, "tableType is null"); + } + + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + String bucketName = "test-hudi-connector-" + randomTableSuffix(); + hiveMinioDataLake = closeAfterClass(new HiveMinioDataLake(bucketName)); + hiveMinioDataLake.start(); + hiveMinioDataLake.getMinioClient().ensureBucketExists(bucketName); + + return S3HudiQueryRunner.create( + HUDI_MINIO_TESTS.getCatalogName(), + HUDI_MINIO_TESTS.getSchemaName(), + ImmutableMap.builder() + .put("hudi.columns-to-hide", columnsToHide()) + .buildOrThrow(), + new TpchHudiTablesInitializer(tableType, REQUIRED_TPCH_TABLES), + hiveMinioDataLake); + } + + @Override + protected boolean hasBehavior(TestingConnectorBehavior connectorBehavior) + { + switch (connectorBehavior) { + // Optimizer + case SUPPORTS_TOPN_PUSHDOWN: + return false; + + // DDL and DML on schemas and tables + case SUPPORTS_CREATE_SCHEMA: + case SUPPORTS_CREATE_TABLE: + case 
SUPPORTS_CREATE_TABLE_WITH_DATA: + case SUPPORTS_COMMENT_ON_TABLE: + case SUPPORTS_RENAME_TABLE: + return false; + + // DDL and DML on columns + case SUPPORTS_ADD_COLUMN: + case SUPPORTS_ADD_COLUMN_WITH_COMMENT: + case SUPPORTS_COMMENT_ON_COLUMN: + case SUPPORTS_RENAME_COLUMN: + return false; + + // Writing capabilities + case SUPPORTS_DELETE: + case SUPPORTS_INSERT: + case SUPPORTS_MULTI_STATEMENT_WRITES: + return false; + + default: + return super.hasBehavior(connectorBehavior); + } + } + + @Test + @Override + public void testShowCreateTable() + { + assertThat((String) computeActual("SHOW CREATE TABLE orders").getOnlyValue()) + .matches("CREATE TABLE \\w+\\.\\w+\\.orders \\Q(\n" + + " orderkey bigint,\n" + + " custkey bigint,\n" + + " orderstatus varchar(1),\n" + + " totalprice double,\n" + + " orderdate date,\n" + + " orderpriority varchar(15),\n" + + " clerk varchar(15),\n" + + " shippriority integer,\n" + + " comment varchar(79)\n" + + ")\n" + + "WITH (\n" + + " location = \\E'.*/orders'\n\\Q" + + ")"); + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/HudiQueryRunner.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/HudiQueryRunner.java new file mode 100644 index 000000000000..4126813a6179 --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/HudiQueryRunner.java @@ -0,0 +1,109 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableMap; +import io.airlift.log.Logger; +import io.airlift.log.Logging; +import io.trino.Session; +import io.trino.plugin.hive.SchemaAlreadyExistsException; +import io.trino.plugin.hive.metastore.Database; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hudi.testing.HudiTablesInitializer; +import io.trino.plugin.hudi.testing.ResourceHudiTablesInitializer; +import io.trino.spi.connector.CatalogSchemaName; +import io.trino.spi.security.PrincipalType; +import io.trino.testing.DistributedQueryRunner; + +import java.io.File; +import java.nio.file.Path; +import java.util.Map; +import java.util.Optional; + +import static io.trino.hadoop.ConfigurationInstantiator.newEmptyConfiguration; +import static io.trino.plugin.hive.metastore.file.FileHiveMetastore.createTestingFileHiveMetastore; +import static io.trino.testing.TestingSession.testSessionBuilder; + +public final class HudiQueryRunner +{ + public static final CatalogSchemaName HUDI_TESTS = new CatalogSchemaName("hudi", "tests"); + + private HudiQueryRunner() {} + + public static DistributedQueryRunner createHudiQueryRunner( + Map serverConfig, + Map connectorConfig, + HudiTablesInitializer dataLoader) + throws Exception + { + Session session = testSessionBuilder() + .setCatalog(HUDI_TESTS.getCatalogName()) + .setSchema(HUDI_TESTS.getSchemaName()) + .build(); + DistributedQueryRunner queryRunner = DistributedQueryRunner + .builder(session) + .setExtraProperties(serverConfig) + .build(); + + Path coordinatorBaseDir = queryRunner.getCoordinator().getBaseDataDir(); + File catalogDir = coordinatorBaseDir.resolve("catalog").toFile(); + HiveMetastore metastore = createTestingFileHiveMetastore(catalogDir); + + // create testing database + Database database = Database.builder() + .setDatabaseName(HUDI_TESTS.getSchemaName()) + .setOwnerName(Optional.of("public")) + .setOwnerType(Optional.of(PrincipalType.ROLE)) + .build(); + 
try { + metastore.createDatabase(database); + } + catch (SchemaAlreadyExistsException e) { + // do nothing if database already exists + } + + queryRunner.installPlugin(new TestingHudiPlugin(Optional.of(metastore))); + queryRunner.createCatalog( + "hudi", + HUDI_TESTS.getCatalogName(), + connectorConfig); + + String dataDir = coordinatorBaseDir.resolve("data").toString(); + dataLoader.initializeTables(queryRunner, metastore, HUDI_TESTS, dataDir, newEmptyConfiguration()); + return queryRunner; + } + + public static void main(String[] args) + throws InterruptedException + { + Logging.initialize(); + Logger log = Logger.get(HudiQueryRunner.class); + + DistributedQueryRunner queryRunner = null; + try (DistributedQueryRunner runner = createHudiQueryRunner( + ImmutableMap.of("http-server.http.port", "8080"), + ImmutableMap.of(), + new ResourceHudiTablesInitializer())) { + queryRunner = runner; + } + catch (Throwable t) { + log.error(t); + System.exit(1); + } + Thread.sleep(100); + + log.info("======== SERVER STARTED ========"); + log.info("\n====\n%s\n====", queryRunner.getCoordinator().getBaseUrl()); + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/S3HudiQueryRunner.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/S3HudiQueryRunner.java new file mode 100644 index 000000000000..b64fce498e15 --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/S3HudiQueryRunner.java @@ -0,0 +1,178 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import io.airlift.log.Logger; +import io.airlift.log.Logging; +import io.trino.hdfs.DynamicHdfsConfiguration; +import io.trino.hdfs.HdfsConfig; +import io.trino.hdfs.HdfsConfigurationInitializer; +import io.trino.hdfs.HdfsContext; +import io.trino.hdfs.HdfsEnvironment; +import io.trino.hdfs.authentication.NoHdfsAuthentication; +import io.trino.plugin.hive.SchemaAlreadyExistsException; +import io.trino.plugin.hive.containers.HiveMinioDataLake; +import io.trino.plugin.hive.metastore.Database; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hive.metastore.thrift.BridgingHiveMetastore; +import io.trino.plugin.hive.s3.HiveS3Config; +import io.trino.plugin.hive.s3.TrinoS3ConfigurationInitializer; +import io.trino.plugin.hudi.testing.HudiTablesInitializer; +import io.trino.plugin.hudi.testing.ResourceHudiTablesInitializer; +import io.trino.spi.connector.CatalogSchemaName; +import io.trino.spi.security.PrincipalType; +import io.trino.testing.DistributedQueryRunner; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; + +import java.util.Map; +import java.util.Optional; + +import static io.trino.plugin.hive.HiveTestUtils.SOCKS_PROXY; +import static io.trino.plugin.hive.TestingThriftHiveMetastoreBuilder.testingThriftHiveMetastoreBuilder; +import static io.trino.plugin.hive.containers.HiveMinioDataLake.MINIO_ACCESS_KEY; +import static io.trino.plugin.hive.containers.HiveMinioDataLake.MINIO_SECRET_KEY; +import static io.trino.testing.DistributedQueryRunner.builder; +import static io.trino.testing.TestingConnectorSession.SESSION; +import static io.trino.testing.TestingSession.testSessionBuilder; + +public final class S3HudiQueryRunner +{ + static final CatalogSchemaName 
HUDI_MINIO_TESTS = new CatalogSchemaName("hudi", "miniotests"); + private static final HdfsContext CONTEXT = new HdfsContext(SESSION); + + private S3HudiQueryRunner() {} + + public static DistributedQueryRunner create( + String catalogName, + String schemaName, + Map connectorProperties, + HudiTablesInitializer dataLoader, + HiveMinioDataLake hiveMinioDataLake) + throws Exception + { + return create( + catalogName, + schemaName, + ImmutableMap.of(), + connectorProperties, + dataLoader, + hiveMinioDataLake); + } + + public static DistributedQueryRunner create( + String catalogName, + String schemaName, + Map extraProperties, + Map connectorProperties, + HudiTablesInitializer dataLoader, + HiveMinioDataLake hiveMinioDataLake) + throws Exception + { + String basePath = "s3a://" + hiveMinioDataLake.getBucketName() + "/" + schemaName; + HdfsEnvironment hdfsEnvironment = getHdfsEnvironment(hiveMinioDataLake); + Configuration configuration = hdfsEnvironment.getConfiguration(CONTEXT, new Path(basePath)); + + HiveMetastore metastore = new BridgingHiveMetastore( + testingThriftHiveMetastoreBuilder() + .metastoreClient(hiveMinioDataLake.getHiveHadoop().getHiveMetastoreEndpoint()) + .hdfsEnvironment(hdfsEnvironment) + .build()); + Database database = Database.builder() + .setDatabaseName(schemaName) + .setOwnerName(Optional.of("public")) + .setOwnerType(Optional.of(PrincipalType.ROLE)) + .build(); + try { + metastore.createDatabase(database); + } + catch (SchemaAlreadyExistsException e) { + // do nothing if database already exists + } + + DistributedQueryRunner queryRunner = builder( + testSessionBuilder() + .setCatalog(catalogName) + .setSchema(schemaName) + .build()) + .setExtraProperties(extraProperties) + .build(); + queryRunner.installPlugin(new TestingHudiPlugin(Optional.of(metastore))); + queryRunner.createCatalog( + catalogName, + "hudi", + ImmutableMap.builder() + .put("hive.s3.aws-access-key", MINIO_ACCESS_KEY) + .put("hive.s3.aws-secret-key", MINIO_SECRET_KEY) + 
.put("hive.s3.endpoint", hiveMinioDataLake.getMinioAddress()) + .put("hive.s3.path-style-access", "true") + .putAll(connectorProperties) + .buildOrThrow()); + + dataLoader.initializeTables(queryRunner, metastore, HUDI_MINIO_TESTS, basePath, configuration); + return queryRunner; + } + + private static HdfsEnvironment getHdfsEnvironment(HiveMinioDataLake hiveMinioDataLake) + { + DynamicHdfsConfiguration dynamicHdfsConfiguration = new DynamicHdfsConfiguration( + new HdfsConfigurationInitializer( + new HdfsConfig() + .setSocksProxy(SOCKS_PROXY.orElse(null)), + ImmutableSet.of( + new TrinoS3ConfigurationInitializer(new HiveS3Config() + .setS3AwsAccessKey(MINIO_ACCESS_KEY) + .setS3AwsSecretKey(MINIO_SECRET_KEY) + .setS3Endpoint(hiveMinioDataLake.getMinioAddress()) + .setS3PathStyleAccess(true)))), + ImmutableSet.of()); + + return new HdfsEnvironment( + dynamicHdfsConfiguration, + new HdfsConfig(), + new NoHdfsAuthentication()); + } + + public static void main(String[] args) + throws InterruptedException + { + Logging.initialize(); + Logger log = Logger.get(S3HudiQueryRunner.class); + + String bucketName = "test-bucket"; + HiveMinioDataLake hiveMinioDataLake = new HiveMinioDataLake(bucketName); + hiveMinioDataLake.start(); + + DistributedQueryRunner queryRunner = null; + try (DistributedQueryRunner runner = create( + HUDI_MINIO_TESTS.getCatalogName(), + HUDI_MINIO_TESTS.getSchemaName(), + ImmutableMap.of("http-server.http.port", "8080"), + ImmutableMap.of(), + new ResourceHudiTablesInitializer(), + hiveMinioDataLake)) { + queryRunner = runner; + } + catch (Throwable t) { + log.error(t); + System.exit(1); + } + Thread.sleep(100); + + log.info("======== SERVER STARTED ========"); + log.info("\n====\n%s\n====", queryRunner.getCoordinator().getBaseUrl()); + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiConfig.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiConfig.java new file mode 100644 index 000000000000..e4e9f8ba237c 
--- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiConfig.java @@ -0,0 +1,75 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableMap; +import io.airlift.units.DataSize; +import org.testng.annotations.Test; + +import java.util.Map; + +import static io.airlift.configuration.testing.ConfigAssertions.assertFullMapping; +import static io.airlift.configuration.testing.ConfigAssertions.assertRecordedDefaults; +import static io.airlift.configuration.testing.ConfigAssertions.recordDefaults; +import static io.airlift.units.DataSize.Unit.MEGABYTE; + +public class TestHudiConfig +{ + @Test + public void testDefaults() + { + assertRecordedDefaults(recordDefaults(HudiConfig.class) + .setColumnsToHide(null) + .setMetadataEnabled(false) + .setUseParquetColumnNames(true) + .setMinPartitionBatchSize(10) + .setMaxPartitionBatchSize(100) + .setSizeBasedSplitWeightsEnabled(true) + .setStandardSplitWeightSize(DataSize.of(128, MEGABYTE)) + .setMinimumAssignedSplitWeight(0.05) + .setMaxSplitsPerSecond(Integer.MAX_VALUE) + .setMaxOutstandingSplits(1000)); + } + + @Test + public void testExplicitPropertyMappings() + { + Map properties = ImmutableMap.builder() + .put("hudi.columns-to-hide", "_hoodie_record_key") + .put("hudi.metadata-enabled", "true") + .put("hudi.parquet.use-column-names", "false") + .put("hudi.min-partition-batch-size", "5") + 
.put("hudi.max-partition-batch-size", "50") + .put("hudi.size-based-split-weights-enabled", "false") + .put("hudi.standard-split-weight-size", "64MB") + .put("hudi.minimum-assigned-split-weight", "0.1") + .put("hudi.max-splits-per-second", "100") + .put("hudi.max-outstanding-splits", "100") + .buildOrThrow(); + + HudiConfig expected = new HudiConfig() + .setColumnsToHide("_hoodie_record_key") + .setMetadataEnabled(true) + .setUseParquetColumnNames(false) + .setMinPartitionBatchSize(5) + .setMaxPartitionBatchSize(50) + .setSizeBasedSplitWeightsEnabled(false) + .setStandardSplitWeightSize(DataSize.of(64, MEGABYTE)) + .setMinimumAssignedSplitWeight(0.1) + .setMaxSplitsPerSecond(100) + .setMaxOutstandingSplits(100); + + assertFullMapping(properties, expected); + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiConnectorFactory.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiConnectorFactory.java new file mode 100644 index 000000000000..36e417b44788 --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiConnectorFactory.java @@ -0,0 +1,70 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableMap; +import io.trino.plugin.base.classloader.ClassLoaderSafeConnectorMetadata; +import io.trino.plugin.base.classloader.ClassLoaderSafeConnectorSplitManager; +import io.trino.spi.connector.Connector; +import io.trino.spi.connector.ConnectorFactory; +import io.trino.spi.connector.ConnectorPageSourceProvider; +import io.trino.spi.connector.ConnectorTransactionHandle; +import io.trino.testing.TestingConnectorContext; +import org.testng.annotations.Test; + +import java.util.Map; + +import static io.airlift.testing.Assertions.assertInstanceOf; +import static io.trino.spi.transaction.IsolationLevel.READ_UNCOMMITTED; +import static io.trino.testing.TestingConnectorSession.SESSION; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +public class TestHudiConnectorFactory +{ + @Test + public void testCreateConnector() + { + assertCreateConnector("thrift://localhost:1234"); + assertCreateConnector("thrift://localhost:1234,thrift://192.0.2.3:5678"); + + assertCreateConnectorFails("abc", "metastoreUri scheme is missing: abc"); + assertCreateConnectorFails("thrift://:8090", "metastoreUri host is missing: thrift://:8090"); + assertCreateConnectorFails("thrift://localhost", "metastoreUri port is missing: thrift://localhost"); + assertCreateConnectorFails("abc::", "metastoreUri scheme must be thrift: abc::"); + assertCreateConnectorFails("", "metastoreUris must specify at least one URI"); + assertCreateConnectorFails("thrift://localhost:1234,thrift://test-1", "metastoreUri port is missing: thrift://test-1"); + } + + private static void assertCreateConnector(String metastoreUri) + { + Map config = ImmutableMap.builder() + .put("hive.metastore.uri", metastoreUri) + .buildOrThrow(); + + ConnectorFactory factory = new HudiConnectorFactory(); + Connector connector = factory.create("test", config, new TestingConnectorContext()); + ConnectorTransactionHandle transaction = 
connector.beginTransaction(READ_UNCOMMITTED, true, true); + assertInstanceOf(connector.getMetadata(SESSION, transaction), ClassLoaderSafeConnectorMetadata.class); + assertInstanceOf(connector.getSplitManager(), ClassLoaderSafeConnectorSplitManager.class); + assertInstanceOf(connector.getPageSourceProvider(), ConnectorPageSourceProvider.class); + connector.commit(transaction); + } + + private static void assertCreateConnectorFails(String metastoreUri, String exceptionString) + { + assertThatThrownBy(() -> assertCreateConnector(metastoreUri)) + .isInstanceOf(RuntimeException.class) + .hasMessageContaining(exceptionString); + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiConnectorMetadataTest.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiConnectorMetadataTest.java new file mode 100644 index 000000000000..e5ae56bae2bb --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiConnectorMetadataTest.java @@ -0,0 +1,31 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableMap; +import io.trino.plugin.hudi.testing.ResourceHudiTablesInitializer; +import io.trino.testing.QueryRunner; + +import static io.trino.plugin.hudi.HudiQueryRunner.createHudiQueryRunner; + +public class TestHudiConnectorMetadataTest + extends TestHudiSmokeTest +{ + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + return createHudiQueryRunner(ImmutableMap.of(), ImmutableMap.of("hudi.metadata-enabled", "true"), new ResourceHudiTablesInitializer()); + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiConnectorParquetColumnNamesTest.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiConnectorParquetColumnNamesTest.java new file mode 100644 index 000000000000..40f987495fb0 --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiConnectorParquetColumnNamesTest.java @@ -0,0 +1,31 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableMap; +import io.trino.plugin.hudi.testing.ResourceHudiTablesInitializer; +import io.trino.testing.QueryRunner; + +import static io.trino.plugin.hudi.HudiQueryRunner.createHudiQueryRunner; + +public class TestHudiConnectorParquetColumnNamesTest + extends TestHudiSmokeTest +{ + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + return createHudiQueryRunner(ImmutableMap.of(), ImmutableMap.of("hudi.parquet.use-column-names", "false"), new ResourceHudiTablesInitializer()); + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiCopyOnWriteConnectorTest.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiCopyOnWriteConnectorTest.java new file mode 100644 index 000000000000..5ac74a957fc3 --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiCopyOnWriteConnectorTest.java @@ -0,0 +1,28 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi; + +import org.apache.hudi.common.model.HoodieTableType; + +import static org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE; + +public class TestHudiCopyOnWriteConnectorTest + extends BaseHudiConnectorTest +{ + @Override + protected HoodieTableType getHoodieTableType() + { + return COPY_ON_WRITE; + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiCopyOnWriteMinioConnectorTest.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiCopyOnWriteMinioConnectorTest.java new file mode 100644 index 000000000000..9202e3491729 --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiCopyOnWriteMinioConnectorTest.java @@ -0,0 +1,25 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi; + +import static org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE; + +public class TestHudiCopyOnWriteMinioConnectorTest + extends BaseHudiMinioConnectorTest +{ + public TestHudiCopyOnWriteMinioConnectorTest() + { + super(COPY_ON_WRITE); + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiMergeOnReadConnectorTest.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiMergeOnReadConnectorTest.java new file mode 100644 index 000000000000..f8acaecd8cf0 --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiMergeOnReadConnectorTest.java @@ -0,0 +1,28 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi; + +import org.apache.hudi.common.model.HoodieTableType; + +import static org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ; + +public class TestHudiMergeOnReadConnectorTest + extends BaseHudiConnectorTest +{ + @Override + protected HoodieTableType getHoodieTableType() + { + return MERGE_ON_READ; + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiMergeOnReadMinioConnectorTest.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiMergeOnReadMinioConnectorTest.java new file mode 100644 index 000000000000..b80b7b57f74a --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiMergeOnReadMinioConnectorTest.java @@ -0,0 +1,25 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi; + +import static org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ; + +public class TestHudiMergeOnReadMinioConnectorTest + extends BaseHudiMinioConnectorTest +{ + public TestHudiMergeOnReadMinioConnectorTest() + { + super(MERGE_ON_READ); + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiSessionProperties.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiSessionProperties.java new file mode 100644 index 000000000000..5220921922fa --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiSessionProperties.java @@ -0,0 +1,41 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableList; +import io.trino.spi.connector.ConnectorSession; +import io.trino.testing.TestingConnectorSession; +import org.testng.annotations.Test; + +import java.util.List; + +import static io.trino.plugin.hudi.HudiSessionProperties.getColumnsToHide; +import static org.testng.Assert.assertEqualsNoOrder; + +public class TestHudiSessionProperties +{ + @Test + public void testSessionPropertyColumnsToHide() + { + HudiConfig config = new HudiConfig() + .setColumnsToHide("col1, col2"); + HudiSessionProperties sessionProperties = new HudiSessionProperties(config); + ConnectorSession session = TestingConnectorSession.builder() + .setPropertyMetadata(sessionProperties.getSessionProperties()) + .build(); + List expectedColumnsToHide = ImmutableList.of("col1", "col2"); + List actualColumnsToHide = getColumnsToHide(session); + assertEqualsNoOrder(expectedColumnsToHide.toArray(), actualColumnsToHide.toArray()); + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiSmokeTest.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiSmokeTest.java new file mode 100644 index 000000000000..737b4d1bf954 --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiSmokeTest.java @@ -0,0 +1,116 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableMap; +import io.trino.plugin.hudi.testing.ResourceHudiTablesInitializer; +import io.trino.testing.AbstractTestQueryFramework; +import io.trino.testing.QueryRunner; +import org.testng.annotations.Test; + +import static io.trino.plugin.hudi.HudiQueryRunner.createHudiQueryRunner; +import static io.trino.plugin.hudi.testing.ResourceHudiTablesInitializer.TestingTable.HUDI_COW_PT_TBL; +import static io.trino.plugin.hudi.testing.ResourceHudiTablesInitializer.TestingTable.HUDI_NON_PART_COW; +import static io.trino.plugin.hudi.testing.ResourceHudiTablesInitializer.TestingTable.STOCK_TICKS_COW; +import static io.trino.plugin.hudi.testing.ResourceHudiTablesInitializer.TestingTable.STOCK_TICKS_MOR; +import static org.assertj.core.api.Assertions.assertThat; + +public class TestHudiSmokeTest + extends AbstractTestQueryFramework +{ + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + return createHudiQueryRunner(ImmutableMap.of(), ImmutableMap.of(), new ResourceHudiTablesInitializer()); + } + + @Test + public void readNonPartitionedTable() + { + assertQuery( + "SELECT rowid, name FROM " + HUDI_NON_PART_COW, + "SELECT * FROM VALUES ('row_1', 'bob'), ('row_2', 'john'), ('row_3', 'tom')"); + } + + @Test + public void readPartitionedTables() + { + assertQuery("SELECT symbol, max(ts) FROM " + STOCK_TICKS_COW + " GROUP BY symbol HAVING symbol = 'GOOG'", + "SELECT * FROM VALUES ('GOOG', '2018-08-31 10:59:00')"); + + assertQuery("SELECT symbol, max(ts) FROM " + STOCK_TICKS_MOR + " GROUP BY symbol HAVING symbol = 'GOOG'", + "SELECT * FROM VALUES ('GOOG', '2018-08-31 10:59:00')"); + + assertQuery("SELECT dt, count(1) FROM " + STOCK_TICKS_MOR + " GROUP BY dt", + "SELECT * FROM VALUES ('2018-08-31', '99')"); + } + + @Test + public void testMultiPartitionedTable() + { + assertQuery("SELECT _hoodie_partition_path, id, name, ts, dt, hh FROM " + HUDI_COW_PT_TBL + " WHERE id = 1", + 
"SELECT * FROM VALUES ('dt=2021-12-09/hh=10', 1, 'a1', 1000, '2021-12-09', '10')"); + assertQuery("SELECT _hoodie_partition_path, id, name, ts, dt, hh FROM " + HUDI_COW_PT_TBL + " WHERE id = 2", + "SELECT * FROM VALUES ('dt=2021-12-09/hh=11', 2, 'a2', 1000, '2021-12-09', '11')"); + } + + @Test + public void testShowCreateTable() + { + assertThat((String) computeActual("SHOW CREATE TABLE " + STOCK_TICKS_COW).getOnlyValue()) + .matches("CREATE TABLE \\w+\\.\\w+\\.stock_ticks_cow \\Q(\n" + + " _hoodie_commit_time varchar,\n" + + " _hoodie_commit_seqno varchar,\n" + + " _hoodie_record_key varchar,\n" + + " _hoodie_partition_path varchar,\n" + + " _hoodie_file_name varchar,\n" + + " volume bigint,\n" + + " ts varchar,\n" + + " symbol varchar,\n" + + " year integer,\n" + + " month varchar,\n" + + " high double,\n" + + " low double,\n" + + " key varchar,\n" + + " date varchar,\n" + + " close double,\n" + + " open double,\n" + + " day varchar,\n" + + " dt varchar\n" + + ")\n" + + "WITH (\n" + + " location = \\E'.*/stock_ticks_cow',\n\\Q" + + " partitioned_by = ARRAY['dt']\n" + + ")"); + // multi-partitioned table + assertThat((String) computeActual("SHOW CREATE TABLE " + HUDI_COW_PT_TBL).getOnlyValue()) + .matches("CREATE TABLE \\w+\\.\\w+\\.hudi_cow_pt_tbl \\Q(\n" + + " _hoodie_commit_time varchar,\n" + + " _hoodie_commit_seqno varchar,\n" + + " _hoodie_record_key varchar,\n" + + " _hoodie_partition_path varchar,\n" + + " _hoodie_file_name varchar,\n" + + " id bigint,\n" + + " name varchar,\n" + + " ts bigint,\n" + + " dt varchar,\n" + + " hh varchar\n" + + ")\n" + + "WITH (\n" + + " location = \\E'.*/hudi_cow_pt_tbl',\n\\Q" + + " partitioned_by = ARRAY['dt','hh']\n" + + ")"); + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiUtil.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiUtil.java new file mode 100644 index 000000000000..226ca77afb60 --- /dev/null +++ 
b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestHudiUtil.java @@ -0,0 +1,88 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableList; +import org.apache.hudi.hadoop.HoodieParquetInputFormat; +import org.testng.annotations.Test; + +import java.util.List; +import java.util.Properties; + +import static io.trino.hadoop.ConfigurationInstantiator.newEmptyConfiguration; +import static io.trino.plugin.hive.HiveStorageFormat.PARQUET; +import static io.trino.plugin.hive.util.HiveUtil.getInputFormat; +import static io.trino.plugin.hudi.HudiUtil.isHudiParquetInputFormat; +import static org.apache.hadoop.hive.common.FileUtils.unescapePathName; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + +public class TestHudiUtil +{ + @Test + public void testIsHudiParquetInputFormat() + { + Properties schema = new Properties(); + schema.setProperty(FILE_INPUT_FORMAT, HoodieParquetInputFormat.class.getName()); + schema.setProperty(SERIALIZATION_LIB, PARQUET.getSerde()); + + assertTrue(isHudiParquetInputFormat(getInputFormat(newEmptyConfiguration(), schema, false))); + } + + @Test + public void testBuildPartitionValues() + { + assertToPartitionValues("partitionColumn1=01/01/2020", 
ImmutableList.of("01/01/2020")); + assertToPartitionValues("partitionColumn1=01/01/2020/partitioncolumn2=abc", ImmutableList.of("01/01/2020", "abc")); + assertToPartitionValues("ds=2015-12-30/event_type=QueryCompletion", ImmutableList.of("2015-12-30", "QueryCompletion")); + assertToPartitionValues("ds=2015-12-30", ImmutableList.of("2015-12-30")); + assertToPartitionValues("a=1", ImmutableList.of("1")); + assertToPartitionValues("a=1/b=2/c=3", ImmutableList.of("1", "2", "3")); + assertToPartitionValues("pk=!@%23$%25%5E&%2A()%2F%3D", ImmutableList.of("!@#$%^&*()/=")); + assertToPartitionValues("pk=__HIVE_DEFAULT_PARTITION__", ImmutableList.of("__HIVE_DEFAULT_PARTITION__")); + } + + private static void assertToPartitionValues(String partitionName, List expected) + { + List actual = buildPartitionValues(partitionName); + assertEquals(actual, expected); + } + + private static List buildPartitionValues(String partitionNames) + { + ImmutableList.Builder values = ImmutableList.builder(); + String[] parts = partitionNames.split("="); + if (parts.length == 1) { + values.add(unescapePathName(partitionNames)); + return values.build(); + } + if (parts.length == 2) { + values.add(unescapePathName(parts[1])); + return values.build(); + } + for (int i = 1; i < parts.length; i++) { + String val = parts[i]; + int j = val.lastIndexOf('/'); + if (j == -1) { + values.add(unescapePathName(val)); + } + else { + values.add(unescapePathName(val.substring(0, j))); + } + } + return values.build(); + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestingHudiConnectorFactory.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestingHudiConnectorFactory.java new file mode 100644 index 000000000000..6009f199ad03 --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestingHudiConnectorFactory.java @@ -0,0 +1,48 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.spi.connector.Connector; +import io.trino.spi.connector.ConnectorContext; +import io.trino.spi.connector.ConnectorFactory; + +import java.util.Map; +import java.util.Optional; + +import static io.trino.plugin.hudi.InternalHudiConnectorFactory.createConnector; +import static java.util.Objects.requireNonNull; + +public class TestingHudiConnectorFactory + implements ConnectorFactory +{ + private final Optional metastore; + + public TestingHudiConnectorFactory(Optional metastore) + { + this.metastore = requireNonNull(metastore, "metastore is null"); + } + + @Override + public String getName() + { + return "hudi"; + } + + @Override + public Connector create(String catalogName, Map config, ConnectorContext context) + { + return createConnector(catalogName, config, context, metastore); + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestingHudiPlugin.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestingHudiPlugin.java new file mode 100644 index 000000000000..42788856d895 --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/TestingHudiPlugin.java @@ -0,0 +1,44 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi; + +import com.google.common.collect.ImmutableList; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.spi.connector.ConnectorFactory; + +import java.util.List; +import java.util.Optional; + +import static com.google.common.base.Verify.verify; +import static java.util.Objects.requireNonNull; + +public class TestingHudiPlugin + extends HudiPlugin +{ + private final Optional metastore; + + public TestingHudiPlugin(Optional metastore) + { + this.metastore = requireNonNull(metastore, "metastore is null"); + } + + @Override + public Iterable getConnectorFactories() + { + List connectorFactories = ImmutableList.copyOf(super.getConnectorFactories()); + verify(connectorFactories.size() == 1, "Unexpected connector factories: %s", connectorFactories); + + return ImmutableList.of(new TestingHudiConnectorFactory(metastore)); + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/testing/HudiTablesInitializer.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/testing/HudiTablesInitializer.java new file mode 100644 index 000000000000..37f1cc1bdc08 --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/testing/HudiTablesInitializer.java @@ -0,0 +1,30 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi.testing; + +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.spi.connector.CatalogSchemaName; +import io.trino.testing.QueryRunner; +import org.apache.hadoop.conf.Configuration; + +public interface HudiTablesInitializer +{ + void initializeTables( + QueryRunner queryRunner, + HiveMetastore metastore, + CatalogSchemaName hudiCatalogSchema, + String dataDir, + Configuration conf) + throws Exception; +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/testing/ResourceHudiTablesInitializer.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/testing/ResourceHudiTablesInitializer.java new file mode 100644 index 000000000000..3ff2eb83501f --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/testing/ResourceHudiTablesInitializer.java @@ -0,0 +1,279 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi.testing; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.airlift.log.Logger; +import io.trino.plugin.hive.HiveStorageFormat; +import io.trino.plugin.hive.HiveType; +import io.trino.plugin.hive.PartitionStatistics; +import io.trino.plugin.hive.metastore.Column; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hive.metastore.Partition; +import io.trino.plugin.hive.metastore.PartitionWithStatistics; +import io.trino.plugin.hive.metastore.PrincipalPrivileges; +import io.trino.plugin.hive.metastore.StorageFormat; +import io.trino.plugin.hive.metastore.Table; +import io.trino.spi.connector.CatalogSchemaName; +import io.trino.testing.QueryRunner; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.TableType; +import org.apache.hudi.common.model.HoodieTableType; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static io.trino.plugin.hive.HivePartitionManager.extractPartitionValues; +import static io.trino.plugin.hive.HiveType.HIVE_DOUBLE; +import static io.trino.plugin.hive.HiveType.HIVE_INT; +import static io.trino.plugin.hive.HiveType.HIVE_LONG; +import static io.trino.plugin.hive.HiveType.HIVE_STRING; +import static org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE; +import static org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ; + +public class ResourceHudiTablesInitializer + implements HudiTablesInitializer +{ + public ResourceHudiTablesInitializer() {} + + @Override + public void initializeTables( + QueryRunner queryRunner, + HiveMetastore metastore, + 
CatalogSchemaName hudiCatalogSchema, + String dataDir, + Configuration conf) + throws Exception + { + Path basePath = Path.of(dataDir); + copyDir(Paths.get("src/test/resources/hudi-testing-data"), basePath); + Logger.get(getClass()).info("Prepared table data in %s", basePath); + + for (TestingTable table : TestingTable.values()) { + String tableName = table.getTableName(); + createTable( + metastore, + hudiCatalogSchema, + basePath.resolve(tableName), + tableName, + table.getDataColumns(), + table.getPartitionColumns(), + table.getPartitions()); + } + } + + private void createTable( + HiveMetastore metastore, + CatalogSchemaName hudiCatalogSchema, + Path tablePath, + String tableName, + List dataColumns, + List partitionColumns, + Map partitions) + { + StorageFormat storageFormat = StorageFormat.fromHiveStorageFormat(HiveStorageFormat.PARQUET); + + Table table = Table.builder() + .setDatabaseName(hudiCatalogSchema.getSchemaName()) + .setTableName(tableName) + .setTableType(TableType.EXTERNAL_TABLE.name()) + .setOwner(Optional.of("public")) + .setDataColumns(dataColumns) + .setPartitionColumns(partitionColumns) + .setParameters(ImmutableMap.of("serialization.format", "1", "EXTERNAL", "TRUE")) + .withStorage(storageBuilder -> storageBuilder + .setStorageFormat(storageFormat) + .setLocation("file://" + tablePath)) + .build(); + metastore.createTable(table, PrincipalPrivileges.NO_PRIVILEGES); + + List partitionsToAdd = new ArrayList<>(); + partitions.forEach((partitionName, partitionPath) -> { + Partition partition = Partition.builder() + .setDatabaseName(hudiCatalogSchema.getSchemaName()) + .setTableName(tableName) + .setValues(extractPartitionValues(partitionName)) + .withStorage(storageBuilder -> storageBuilder + .setStorageFormat(storageFormat) + .setLocation("file://" + tablePath.resolve(partitionPath))) + .setColumns(dataColumns) + .build(); + partitionsToAdd.add(new PartitionWithStatistics(partition, partitionName, PartitionStatistics.empty())); + }); + 
metastore.addPartitions(hudiCatalogSchema.getSchemaName(), tableName, partitionsToAdd); + } + + private static Column column(String name, HiveType type) + { + return new Column(name, type, Optional.empty()); + } + + private static void copyDir(Path srcDir, Path dstDir) + throws IOException + { + try (Stream paths = Files.walk(srcDir)) { + for (Iterator iterator = paths.iterator(); iterator.hasNext(); ) { + Path path = iterator.next(); + Path relativePath = srcDir.relativize(path); + if (path.toFile().isDirectory()) { + Files.createDirectories(dstDir.resolve(relativePath)); + } + else { + Path dstFile = dstDir.resolve(relativePath); + Files.createDirectories(dstFile.getParent()); + Files.copy(path, dstFile); + } + } + } + } + + public enum TestingTable + { + HUDI_NON_PART_COW(COPY_ON_WRITE, nonPartitionRegularColumns()), + HUDI_COW_PT_TBL(COPY_ON_WRITE, multiPartitionRegularColumns(), multiPartitionColumns(), multiPartitions()), + STOCK_TICKS_COW(COPY_ON_WRITE, stockTicksRegularColumns(), stockTicksPartitionColumns(), stockTicksPartitions()), + STOCK_TICKS_MOR(MERGE_ON_READ, stockTicksRegularColumns(), stockTicksPartitionColumns(), stockTicksPartitions()), + /**/; + + private static final List HUDI_META_COLUMNS = ImmutableList.of( + new Column("_hoodie_commit_time", HIVE_STRING, Optional.empty()), + new Column("_hoodie_commit_seqno", HIVE_STRING, Optional.empty()), + new Column("_hoodie_record_key", HIVE_STRING, Optional.empty()), + new Column("_hoodie_partition_path", HIVE_STRING, Optional.empty()), + new Column("_hoodie_file_name", HIVE_STRING, Optional.empty())); + + private final HoodieTableType tableType; + private final List regularColumns; + private final List partitionColumns; + private final Map partitions; + + TestingTable( + HoodieTableType tableType, + List regularColumns, + List partitionColumns, + Map partitions) + { + this.tableType = tableType; + this.regularColumns = regularColumns; + this.partitionColumns = partitionColumns; + this.partitions = 
partitions; + } + + TestingTable(HoodieTableType tableType, List regularColumns) + { + this(tableType, regularColumns, ImmutableList.of(), ImmutableMap.of()); + } + + public String getTableName() + { + return name().toLowerCase(Locale.ROOT); + } + + public HoodieTableType getTableType() + { + return tableType; + } + + public List getDataColumns() + { + return Stream.of(HUDI_META_COLUMNS, regularColumns) + .flatMap(Collection::stream) + .collect(Collectors.toUnmodifiableList()); + } + + public List getPartitionColumns() + { + return partitionColumns; + } + + public Map getPartitions() + { + return partitions; + } + + private static List nonPartitionRegularColumns() + { + return ImmutableList.of( + column("rowid", HIVE_STRING), + column("partitionid", HIVE_STRING), + column("precomb", HIVE_LONG), + column("name", HIVE_STRING), + column("versionid", HIVE_STRING), + column("tobedeletedstr", HIVE_STRING), + column("inttolong", HIVE_INT), + column("longtoint", HIVE_LONG)); + } + + private static List stockTicksRegularColumns() + { + return ImmutableList.of( + column("volume", HIVE_LONG), + column("ts", HIVE_STRING), + column("symbol", HIVE_STRING), + column("year", HIVE_INT), + column("month", HIVE_STRING), + column("high", HIVE_DOUBLE), + column("low", HIVE_DOUBLE), + column("key", HIVE_STRING), + column("date", HIVE_STRING), + column("close", HIVE_DOUBLE), + column("open", HIVE_DOUBLE), + column("day", HIVE_STRING)); + } + + private static List stockTicksPartitionColumns() + { + return ImmutableList.of(column("dt", HIVE_STRING)); + } + + private static Map stockTicksPartitions() + { + return ImmutableMap.of("dt=2018-08-31", "2018/08/31"); + } + + private static List multiPartitionRegularColumns() + { + return ImmutableList.of( + column("id", HIVE_LONG), + column("name", HIVE_STRING), + column("ts", HIVE_LONG)); + } + + private static List multiPartitionColumns() + { + return ImmutableList.of( + column("dt", HIVE_STRING), + column("hh", HIVE_STRING)); + } + + private 
static Map multiPartitions() + { + return ImmutableMap.of( + "dt=2021-12-09/hh=10", "dt=2021-12-09/hh=10", + "dt=2021-12-09/hh=11", "dt=2021-12-09/hh=11"); + } + } +} diff --git a/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/testing/TpchHudiTablesInitializer.java b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/testing/TpchHudiTablesInitializer.java new file mode 100644 index 000000000000..6fa190dac36e --- /dev/null +++ b/plugin/trino-hudi/src/test/java/io/trino/plugin/hudi/testing/TpchHudiTablesInitializer.java @@ -0,0 +1,377 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hudi.testing; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.airlift.log.Logger; +import io.trino.plugin.hive.HiveStorageFormat; +import io.trino.plugin.hive.HiveType; +import io.trino.plugin.hive.metastore.Column; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hive.metastore.StorageFormat; +import io.trino.plugin.hive.metastore.Table; +import io.trino.plugin.tpch.TpchPlugin; +import io.trino.spi.connector.CatalogSchemaName; +import io.trino.testing.MaterializedResult; +import io.trino.testing.MaterializedRow; +import io.trino.testing.QueryRunner; +import io.trino.tpch.TpchColumn; +import io.trino.tpch.TpchColumnType; +import io.trino.tpch.TpchColumnTypes; +import io.trino.tpch.TpchTable; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hudi.client.HoodieJavaWriteClient; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.bootstrap.index.NoOpBootstrapIndex; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.marker.MarkerType; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; +import org.intellij.lang.annotations.Language; + +import java.io.IOException; +import java.time.LocalDate; +import java.time.temporal.ChronoField; +import java.util.ArrayList; +import 
java.util.Collection; +import java.util.List; +import java.util.Optional; +import java.util.UUID; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Verify.verify; +import static io.trino.plugin.hive.HiveType.HIVE_DATE; +import static io.trino.plugin.hive.HiveType.HIVE_DOUBLE; +import static io.trino.plugin.hive.HiveType.HIVE_INT; +import static io.trino.plugin.hive.HiveType.HIVE_LONG; +import static io.trino.plugin.hive.HiveType.HIVE_STRING; +import static io.trino.plugin.hive.metastore.PrincipalPrivileges.NO_PRIVILEGES; +import static java.lang.String.format; +import static java.util.Collections.unmodifiableList; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toUnmodifiableList; +import static org.apache.hadoop.hive.metastore.TableType.EXTERNAL_TABLE; + +public class TpchHudiTablesInitializer + implements HudiTablesInitializer +{ + public static final String FIELD_UUID = "_uuid"; + private static final CatalogSchemaName TPCH_TINY = new CatalogSchemaName("tpch", "tiny"); + private static final String PARTITION_PATH = ""; + private static final Logger log = Logger.get(TpchHudiTablesInitializer.class); + private static final List HUDI_META_COLUMNS = ImmutableList.of( + new Column("_hoodie_commit_time", HIVE_STRING, Optional.empty()), + new Column("_hoodie_commit_seqno", HIVE_STRING, Optional.empty()), + new Column("_hoodie_record_key", HIVE_STRING, Optional.empty()), + new Column("_hoodie_partition_path", HIVE_STRING, Optional.empty()), + new Column("_hoodie_file_name", HIVE_STRING, Optional.empty())); + + private final HoodieTableType tableType; + private final List> tpchTables; + + public TpchHudiTablesInitializer(HoodieTableType tableType, List> tpchTables) + { + this.tableType = requireNonNull(tableType, "tableType is null"); + this.tpchTables = 
requireNonNull(tpchTables, "tpchTables is null"); + } + + @Override + public void initializeTables( + QueryRunner queryRunner, + HiveMetastore metastore, + CatalogSchemaName hudiCatalogSchema, + String dataDir, + Configuration conf) + { + queryRunner.installPlugin(new TpchPlugin()); + queryRunner.createCatalog(TPCH_TINY.getCatalogName(), "tpch", ImmutableMap.of()); + for (TpchTable table : tpchTables) { + load(table, queryRunner, metastore, hudiCatalogSchema, dataDir, conf); + } + } + + private void load( + TpchTable table, + QueryRunner queryRunner, + HiveMetastore metastore, + CatalogSchemaName hudiCatalogSchema, + String basePath, + Configuration conf) + { + try (HoodieJavaWriteClient writeClient = createWriteClient(table, basePath, conf)) { + RecordConverter recordConverter = createRecordConverter(table); + + @Language("SQL") String sql = generateScanSql(TPCH_TINY, table); + log.info("Executing %s", sql); + MaterializedResult result = queryRunner.execute(sql); + + List> records = result.getMaterializedRows() + .stream() + .map(MaterializedRow::getFields) + .map(recordConverter::toRecord) + .collect(Collectors.toList()); + String timestamp = "0"; + writeClient.startCommitWithTime(timestamp); + writeClient.insert(records, timestamp); + } + + metastore.createTable(createMetastoreTable(table, hudiCatalogSchema, basePath), NO_PRIVILEGES); + } + + private String generateScanSql(CatalogSchemaName catalogSchemaName, TpchTable table) + { + StringBuilder builder = new StringBuilder(); + builder.append("SELECT "); + String columnList = table.getColumns() + .stream() + .map(c -> quote(c.getSimplifiedColumnName())) + .collect(Collectors.joining(", ")); + builder.append(columnList); + String tableName = format("%s.%s", catalogSchemaName.toString(), table.getTableName()); + builder.append(" FROM ").append(tableName); + return builder.toString(); + } + + private Table createMetastoreTable(TpchTable table, CatalogSchemaName targetCatalogSchema, String basePath) + { + String 
tablePath = getTablePath(table, basePath); + List columns = Stream.of(HUDI_META_COLUMNS, createMetastoreColumns(table)) + .flatMap(Collection::stream) + .collect(toUnmodifiableList()); + // TODO: create right format + StorageFormat storageFormat = StorageFormat.fromHiveStorageFormat(HiveStorageFormat.PARQUET); + + return Table.builder() + .setDatabaseName(targetCatalogSchema.getSchemaName()) + .setTableName(table.getTableName()) + .setTableType(EXTERNAL_TABLE.name()) + .setOwner(Optional.of("public")) + .setDataColumns(columns) + .setParameters(ImmutableMap.of("serialization.format", "1", "EXTERNAL", "TRUE")) + .withStorage(storageBuilder -> storageBuilder + .setStorageFormat(storageFormat) + .setLocation(tablePath)) + .build(); + } + + private HoodieJavaWriteClient createWriteClient(TpchTable table, String basePath, Configuration conf) + { + String tableName = table.getTableName(); + String tablePath = getTablePath(table, basePath); + Schema schema = createAvroSchema(table); + + try { + HoodieTableMetaClient.withPropertyBuilder() + .setTableType(tableType) + .setTableName(tableName) + .setBootstrapIndexClass(NoOpBootstrapIndex.class.getName()) + .setPayloadClassName(HoodieAvroPayload.class.getName()) + .setRecordKeyFields(FIELD_UUID) + .initTable(conf, tablePath); + } + catch (IOException e) { + throw new RuntimeException("Could not init table " + tableName, e); + } + + HoodieIndexConfig indexConfig = HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build(); + HoodieCompactionConfig compactionConfig = HoodieCompactionConfig.newBuilder().archiveCommitsWith(20, 30).build(); + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder() + .withPath(tablePath) + .withSchema(schema.toString()) + .withParallelism(2, 2) + .withDeleteParallelism(2) + .forTable(tableName) + .withIndexConfig(indexConfig) + .withCompactionConfig(compactionConfig) + .withEmbeddedTimelineServerEnabled(false) + .withMarkersType(MarkerType.DIRECT.name()) + .build(); + 
return new HoodieJavaWriteClient<>(new HoodieJavaEngineContext(conf), cfg); + } + + private String getTablePath(TpchTable table, String basePath) + { + return basePath + "/" + table.getTableName(); + } + + private static RecordConverter createRecordConverter(TpchTable table) + { + Schema schema = createAvroSchema(table); + List> columns = table.getColumns(); + + int numberOfColumns = columns.size(); + List columnNames = columns.stream() + .map(TpchColumn::getSimplifiedColumnName) + .collect(toUnmodifiableList()); + List> columnConverters = columns.stream() + .map(TpchColumn::getType) + .map(TpchHudiTablesInitializer::avroEncoderOf) + .collect(toUnmodifiableList()); + + return row -> { + checkArgument(row.size() == numberOfColumns); + + // Create a GenericRecord + GenericRecord record = new GenericData.Record(schema); + for (int i = 0; i < numberOfColumns; i++) { + record.put(columnNames.get(i), columnConverters.get(i).apply(row.get(i))); + } + // Add extra uuid column + String uuid = UUID.randomUUID().toString(); + record.put(FIELD_UUID, uuid); + + // wrap to a HoodieRecord + HoodieKey key = new HoodieKey(uuid, PARTITION_PATH); + HoodieAvroPayload data = new HoodieAvroPayload(Option.of(record)); + return new HoodieRecord<>(key, data) + { + @Override + public HoodieRecord newInstance() + { + return new HoodieAvroRecord<>(key, data, null); + } + }; + }; + } + + private static Schema createAvroSchema(TpchTable table) + { + List> tpchColumns = table.getColumns(); + List fields = new ArrayList<>(tpchColumns.size() + 1); + for (TpchColumn column : tpchColumns) { + String columnName = column.getSimplifiedColumnName(); + Schema.Type columnSchemaType = toSchemaType(column.getType()); + // Schema.createUnion(Schema.create(Schema.Type.NULL), Schema.create(type)); + fields.add(new Schema.Field(columnName, Schema.create(columnSchemaType))); + } + fields.add(new Schema.Field(FIELD_UUID, Schema.create(Schema.Type.STRING))); + String name = table.getTableName(); + return 
Schema.createRecord(name, null, null, false, fields); + } + + private static List createMetastoreColumns(TpchTable table) + { + List> tpchColumns = table.getColumns(); + List columns = new ArrayList<>(tpchColumns.size() + 1); + for (TpchColumn c : tpchColumns) { + HiveType hiveType = TpchColumnTypeAdapter.toHiveType(c.getType()); + columns.add(new Column(c.getSimplifiedColumnName(), hiveType, Optional.empty())); + } + columns.add(new Column(FIELD_UUID, HIVE_STRING, Optional.empty())); + return unmodifiableList(columns); + } + + private static Schema.Type toSchemaType(TpchColumnType columnType) + { + return TpchColumnTypeAdapter.of(columnType).avroType; + } + + private static Function avroEncoderOf(TpchColumnType columnType) + { + return TpchColumnTypeAdapter.of(columnType).avroEncoder; + } + + private static String quote(String name) + { + return "\"" + name + "\""; + } + + private enum TpchColumnTypeAdapter + { + INTEGER(Schema.Type.INT, hiveTypeOf(HIVE_INT), Function.identity()), + IDENTIFIER(Schema.Type.LONG, hiveTypeOf(HIVE_LONG), Function.identity()), + DATE(Schema.Type.INT, hiveTypeOf(HIVE_DATE), TpchColumnTypeAdapter::convertDate), + DOUBLE(Schema.Type.DOUBLE, hiveTypeOf(HIVE_DOUBLE), Function.identity()), + VARCHAR(Schema.Type.STRING, TpchColumnTypeAdapter::hiveVarcharOf, Function.identity()), + /**/; + + static TpchColumnTypeAdapter of(TpchColumnType columnType) + { + if (columnType == TpchColumnTypes.INTEGER) { + return INTEGER; + } + else if (columnType == TpchColumnTypes.IDENTIFIER) { + return IDENTIFIER; + } + else if (columnType == TpchColumnTypes.DATE) { + return DATE; + } + else if (columnType == TpchColumnTypes.DOUBLE) { + return DOUBLE; + } + else { + if (columnType.getBase() != TpchColumnType.Base.VARCHAR || columnType.getPrecision().isEmpty()) { + throw new IllegalArgumentException("Illegal column type: " + columnType); + } + return VARCHAR; + } + } + + static HiveType toHiveType(TpchColumnType columnType) + { + return 
of(columnType).hiveTypeConverter.apply(columnType); + } + + private final Schema.Type avroType; + private final Function hiveTypeConverter; + private final Function avroEncoder; + + TpchColumnTypeAdapter( + Schema.Type avroType, + Function hiveTypeConverter, + Function avroEncoder) + { + this.avroType = avroType; + this.hiveTypeConverter = hiveTypeConverter; + this.avroEncoder = avroEncoder; + } + + private static Function hiveTypeOf(HiveType hiveType) + { + return ignored -> hiveType; + } + + private static HiveType hiveVarcharOf(TpchColumnType type) + { + verify(type.getPrecision().isPresent()); + return HiveType.valueOf("varchar(" + type.getPrecision().get() + ")"); + } + + private static Object convertDate(Object input) + { + LocalDate date = (LocalDate) input; + return (int) date.getLong(ChronoField.EPOCH_DAY); + } + } + + private interface RecordConverter + { + HoodieRecord toRecord(List row); + } +} diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/20220906063435640.commit b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/20220906063435640.commit new file mode 100644 index 000000000000..f7993d9cbb86 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/20220906063435640.commit @@ -0,0 +1,73 @@ +{ + "partitionToWriteStats" : { + "dt=2021-12-09/hh=10" : [ { + "fileId" : "719c3273-2805-4124-b1ac-e980dada85bf-0", + "path" : "dt=2021-12-09/hh=10/719c3273-2805-4124-b1ac-e980dada85bf-0_0-27-1215_20220906063435640.parquet", + "prevCommit" : "null", + "numWrites" : 1, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 1, + "totalWriteBytes" : 435204, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : "dt=2021-12-09/hh=10", + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 
0, + "fileSizeInBytes" : 435204, + "minEventTime" : null, + "maxEventTime" : null + } ] + }, + "compacted" : false, + "extraMetadata" : { + "schema" : "{\"type\":\"record\",\"name\":\"hudi_cow_pt_tbl_record\",\"namespace\":\"hoodie.hudi_cow_pt_tbl\",\"fields\":[{\"name\":\"id\",\"type\":\"long\"},{\"name\":\"name\",\"type\":\"string\"},{\"name\":\"ts\",\"type\":\"long\"},{\"name\":\"dt\",\"type\":\"string\"},{\"name\":\"hh\",\"type\":\"string\"}]}" + }, + "operationType" : "UPSERT", + "writePartitionPaths" : [ "dt=2021-12-09/hh=10" ], + "fileIdAndRelativePaths" : { + "719c3273-2805-4124-b1ac-e980dada85bf-0" : "dt=2021-12-09/hh=10/719c3273-2805-4124-b1ac-e980dada85bf-0_0-27-1215_20220906063435640.parquet" + }, + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 512, + "totalUpsertTime" : 0, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + }, + "writeStats" : [ { + "fileId" : "719c3273-2805-4124-b1ac-e980dada85bf-0", + "path" : "dt=2021-12-09/hh=10/719c3273-2805-4124-b1ac-e980dada85bf-0_0-27-1215_20220906063435640.parquet", + "prevCommit" : "null", + "numWrites" : 1, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 1, + "totalWriteBytes" : 435204, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : "dt=2021-12-09/hh=10", + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 435204, + "minEventTime" : null, + "maxEventTime" : null + } ] +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/20220906063435640.commit.requested 
b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/20220906063435640.commit.requested new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/20220906063435640.inflight b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/20220906063435640.inflight new file mode 100644 index 000000000000..5bb738a80f74 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/20220906063435640.inflight @@ -0,0 +1,71 @@ +{ + "partitionToWriteStats" : { + "dt=2021-12-09/hh=10" : [ { + "fileId" : "", + "path" : null, + "prevCommit" : "null", + "numWrites" : 0, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 1, + "totalWriteBytes" : 0, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : null, + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 0, + "minEventTime" : null, + "maxEventTime" : null + } ] + }, + "compacted" : false, + "extraMetadata" : { }, + "operationType" : "UPSERT", + "writePartitionPaths" : [ "dt=2021-12-09/hh=10" ], + "fileIdAndRelativePaths" : { + "" : null + }, + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 0, + "totalUpsertTime" : 0, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + }, + "writeStats" : [ { + "fileId" : "", + "path" : null, + "prevCommit" : "null", + "numWrites" : 0, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 1, + "totalWriteBytes" : 0, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : null, + "totalLogRecords" : 0, + 
"totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 0, + "minEventTime" : null, + "maxEventTime" : null + } ] +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/20220906063456550.commit b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/20220906063456550.commit new file mode 100644 index 000000000000..0b2cacafaf52 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/20220906063456550.commit @@ -0,0 +1,73 @@ +{ + "partitionToWriteStats" : { + "dt=2021-12-09/hh=11" : [ { + "fileId" : "4a3fcb9b-65eb-4f6e-acf9-7b0764bb4dd1-0", + "path" : "dt=2021-12-09/hh=11/4a3fcb9b-65eb-4f6e-acf9-7b0764bb4dd1-0_0-70-2444_20220906063456550.parquet", + "prevCommit" : "null", + "numWrites" : 1, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 1, + "totalWriteBytes" : 435204, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : "dt=2021-12-09/hh=11", + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 435204, + "minEventTime" : null, + "maxEventTime" : null + } ] + }, + "compacted" : false, + "extraMetadata" : { + "schema" : "{\"type\":\"record\",\"name\":\"hudi_cow_pt_tbl_record\",\"namespace\":\"hoodie.hudi_cow_pt_tbl\",\"fields\":[{\"name\":\"id\",\"type\":\"long\"},{\"name\":\"name\",\"type\":\"string\"},{\"name\":\"ts\",\"type\":\"long\"},{\"name\":\"dt\",\"type\":\"string\"},{\"name\":\"hh\",\"type\":\"string\"}]}" + }, + "operationType" : "UPSERT", + "writePartitionPaths" : [ "dt=2021-12-09/hh=11" ], + "fileIdAndRelativePaths" : { + "4a3fcb9b-65eb-4f6e-acf9-7b0764bb4dd1-0" : 
"dt=2021-12-09/hh=11/4a3fcb9b-65eb-4f6e-acf9-7b0764bb4dd1-0_0-70-2444_20220906063456550.parquet" + }, + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 72, + "totalUpsertTime" : 0, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + }, + "writeStats" : [ { + "fileId" : "4a3fcb9b-65eb-4f6e-acf9-7b0764bb4dd1-0", + "path" : "dt=2021-12-09/hh=11/4a3fcb9b-65eb-4f6e-acf9-7b0764bb4dd1-0_0-70-2444_20220906063456550.parquet", + "prevCommit" : "null", + "numWrites" : 1, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 1, + "totalWriteBytes" : 435204, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : "dt=2021-12-09/hh=11", + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 435204, + "minEventTime" : null, + "maxEventTime" : null + } ] +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/20220906063456550.commit.requested b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/20220906063456550.commit.requested new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/20220906063456550.inflight b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/20220906063456550.inflight new file mode 100644 index 000000000000..8ef87d4ed5f6 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/20220906063456550.inflight @@ -0,0 +1,71 @@ +{ + "partitionToWriteStats" : { + "dt=2021-12-09/hh=11" : [ { + "fileId" : "", + "path" : null, + 
"prevCommit" : "null", + "numWrites" : 0, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 1, + "totalWriteBytes" : 0, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : null, + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 0, + "minEventTime" : null, + "maxEventTime" : null + } ] + }, + "compacted" : false, + "extraMetadata" : { }, + "operationType" : "UPSERT", + "writePartitionPaths" : [ "dt=2021-12-09/hh=11" ], + "fileIdAndRelativePaths" : { + "" : null + }, + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 0, + "totalUpsertTime" : 0, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + }, + "writeStats" : [ { + "fileId" : "", + "path" : null, + "prevCommit" : "null", + "numWrites" : 0, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 1, + "totalWriteBytes" : 0, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : null, + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 0, + "minEventTime" : null, + "maxEventTime" : null + } ] +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/hoodie.properties b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/hoodie.properties new file mode 100644 index 000000000000..4d3a2d67cfc6 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/.hoodie/hoodie.properties @@ -0,0 +1,18 @@ +#Updated at 
2022-09-06T06:34:40.904Z +#Tue Sep 06 06:34:40 UTC 2022 +hoodie.table.precombine.field=ts +hoodie.datasource.write.drop.partition.columns=false +hoodie.table.partition.fields=dt,hh +hoodie.table.type=COPY_ON_WRITE +hoodie.archivelog.folder=archived +hoodie.timeline.layout.version=1 +hoodie.table.version=5 +hoodie.table.metadata.partitions=files +hoodie.table.recordkey.fields=id +hoodie.database.name=default +hoodie.datasource.write.partitionpath.urlencode=false +hoodie.table.keygenerator.class=org.apache.hudi.keygen.ComplexKeyGenerator +hoodie.table.name=hudi_cow_pt_tbl +hoodie.datasource.write.hive_style_partitioning=true +hoodie.table.checksum=1395413629 +hoodie.table.create.schema={"type"\:"record","name"\:"hudi_cow_pt_tbl_record","namespace"\:"hoodie.hudi_cow_pt_tbl","fields"\:[{"name"\:"_hoodie_commit_time","type"\:["string","null"]},{"name"\:"_hoodie_commit_seqno","type"\:["string","null"]},{"name"\:"_hoodie_record_key","type"\:["string","null"]},{"name"\:"_hoodie_partition_path","type"\:["string","null"]},{"name"\:"_hoodie_file_name","type"\:["string","null"]},{"name"\:"id","type"\:["long","null"]},{"name"\:"name","type"\:["string","null"]},{"name"\:"ts","type"\:["long","null"]},{"name"\:"dt","type"\:["string","null"]},{"name"\:"hh","type"\:["string","null"]}]} diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/dt=2021-12-09/hh=10/.hoodie_partition_metadata b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/dt=2021-12-09/hh=10/.hoodie_partition_metadata new file mode 100644 index 000000000000..92ce82c886b7 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/dt=2021-12-09/hh=10/.hoodie_partition_metadata @@ -0,0 +1,4 @@ +#partition metadata +#Tue Sep 06 06:34:49 UTC 2022 +commitTime=20220906063435640 +partitionDepth=2 diff --git 
a/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/dt=2021-12-09/hh=10/719c3273-2805-4124-b1ac-e980dada85bf-0_0-27-1215_20220906063435640.parquet b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/dt=2021-12-09/hh=10/719c3273-2805-4124-b1ac-e980dada85bf-0_0-27-1215_20220906063435640.parquet new file mode 100644 index 000000000000..4785e8c09095 Binary files /dev/null and b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/dt=2021-12-09/hh=10/719c3273-2805-4124-b1ac-e980dada85bf-0_0-27-1215_20220906063435640.parquet differ diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/dt=2021-12-09/hh=11/.hoodie_partition_metadata b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/dt=2021-12-09/hh=11/.hoodie_partition_metadata new file mode 100644 index 000000000000..ed8c619a1362 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/dt=2021-12-09/hh=11/.hoodie_partition_metadata @@ -0,0 +1,4 @@ +#partition metadata +#Tue Sep 06 06:35:03 UTC 2022 +commitTime=20220906063456550 +partitionDepth=2 diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/dt=2021-12-09/hh=11/4a3fcb9b-65eb-4f6e-acf9-7b0764bb4dd1-0_0-70-2444_20220906063456550.parquet b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/dt=2021-12-09/hh=11/4a3fcb9b-65eb-4f6e-acf9-7b0764bb4dd1-0_0-70-2444_20220906063456550.parquet new file mode 100644 index 000000000000..f2918fae851c Binary files /dev/null and b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_cow_pt_tbl/dt=2021-12-09/hh=11/4a3fcb9b-65eb-4f6e-acf9-7b0764bb4dd1-0_0-70-2444_20220906063456550.parquet differ diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_non_part_cow/.hoodie/20211217110514527.commit b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_non_part_cow/.hoodie/20211217110514527.commit new file mode 
100644 index 000000000000..f77eeb137f02 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_non_part_cow/.hoodie/20211217110514527.commit @@ -0,0 +1,50 @@ +{ + "partitionToWriteStats" : { + "" : [ { + "fileId" : "d0875d00-483d-4e8b-bbbe-c520366c47a0-0", + "path" : "d0875d00-483d-4e8b-bbbe-c520366c47a0-0_0-6-11_20211217110514527.parquet", + "prevCommit" : "null", + "numWrites" : 3, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 3, + "totalWriteBytes" : 436273, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : "", + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 436273, + "minEventTime" : null, + "maxEventTime" : null + } ] + }, + "compacted" : false, + "extraMetadata" : { + "schema" : "{\"type\":\"record\",\"name\":\"hudi_non_part_cow_record\",\"namespace\":\"hoodie.hudi_non_part_cow\",\"fields\":[{\"name\":\"rowId\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"partitionId\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"preComb\",\"type\":[\"null\",\"long\"],\"default\":null},{\"name\":\"name\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"versionId\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"toBeDeletedStr\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"intToLong\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"longToInt\",\"type\":[\"null\",\"long\"],\"default\":null}]}" + }, + "operationType" : "INSERT", + "writePartitionPaths" : [ "" ], + "fileIdAndRelativePaths" : { + "d0875d00-483d-4e8b-bbbe-c520366c47a0-0" : "d0875d00-483d-4e8b-bbbe-c520366c47a0-0_0-6-11_20211217110514527.parquet" + }, + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + 
"totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 1743, + "totalUpsertTime" : 0, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + } +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_non_part_cow/.hoodie/20211217110514527.commit.requested b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_non_part_cow/.hoodie/20211217110514527.commit.requested new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_non_part_cow/.hoodie/20211217110514527.inflight b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_non_part_cow/.hoodie/20211217110514527.inflight new file mode 100644 index 000000000000..6605bcaf9b36 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_non_part_cow/.hoodie/20211217110514527.inflight @@ -0,0 +1,48 @@ +{ + "partitionToWriteStats" : { + "" : [ { + "fileId" : "", + "path" : null, + "prevCommit" : "null", + "numWrites" : 0, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 3, + "totalWriteBytes" : 0, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : null, + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 0, + "minEventTime" : null, + "maxEventTime" : null + } ] + }, + "compacted" : false, + "extraMetadata" : { }, + "operationType" : "INSERT", + "writePartitionPaths" : [ "" ], + "fileIdAndRelativePaths" : { + "" : null + }, + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 0, + "totalUpsertTime" : 0, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + 
"present" : false + } + } +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_non_part_cow/.hoodie/hoodie.properties b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_non_part_cow/.hoodie/hoodie.properties new file mode 100644 index 000000000000..3d03fa7915c3 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_non_part_cow/.hoodie/hoodie.properties @@ -0,0 +1,14 @@ +#Properties saved on Fri Dec 17 11:05:14 UTC 2021 +#Fri Dec 17 11:05:14 UTC 2021 +hoodie.table.precombine.field=preComb +hoodie.table.partition.fields= +hoodie.table.type=COPY_ON_WRITE +hoodie.archivelog.folder=archived +hoodie.populate.meta.fields=true +hoodie.timeline.layout.version=1 +hoodie.table.version=3 +hoodie.table.recordkey.fields=rowId +hoodie.table.base.file.format=PARQUET +hoodie.table.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator +hoodie.table.name=hudi_non_part_cow +hoodie.datasource.write.hive_style_partitioning=false diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_non_part_cow/.hoodie_partition_metadata b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_non_part_cow/.hoodie_partition_metadata new file mode 100644 index 000000000000..f2149eb6cd5a --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_non_part_cow/.hoodie_partition_metadata @@ -0,0 +1,4 @@ +#partition metadata +#Fri Dec 17 11:05:23 UTC 2021 +commitTime=20211217110514527 +partitionDepth=0 diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_non_part_cow/d0875d00-483d-4e8b-bbbe-c520366c47a0-0_0-6-11_20211217110514527.parquet b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_non_part_cow/d0875d00-483d-4e8b-bbbe-c520366c47a0-0_0-6-11_20211217110514527.parquet new file mode 100644 index 000000000000..52de8719bf62 Binary files /dev/null and 
b/plugin/trino-hudi/src/test/resources/hudi-testing-data/hudi_non_part_cow/d0875d00-483d-4e8b-bbbe-c520366c47a0-0_0-6-11_20211217110514527.parquet differ diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_cow/.hoodie/20211216071453747.commit b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_cow/.hoodie/20211216071453747.commit new file mode 100644 index 000000000000..18cf55cc1bfd --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_cow/.hoodie/20211216071453747.commit @@ -0,0 +1,51 @@ +{ + "partitionToWriteStats" : { + "2018/08/31" : [ { + "fileId" : "871677fb-e0e3-46f8-9cc1-fe497e317216-0", + "path" : "2018/08/31/871677fb-e0e3-46f8-9cc1-fe497e317216-0_0-28-26_20211216071453747.parquet", + "prevCommit" : "null", + "numWrites" : 99, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 99, + "totalWriteBytes" : 440747, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : "2018/08/31", + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 440747, + "minEventTime" : null, + "maxEventTime" : null + } ] + }, + "compacted" : false, + "extraMetadata" : { + "schema" : "{\"type\":\"record\",\"name\":\"stock_ticks\",\"fields\":[{\"name\":\"volume\",\"type\":\"long\"},{\"name\":\"ts\",\"type\":\"string\"},{\"name\":\"symbol\",\"type\":\"string\"},{\"name\":\"year\",\"type\":\"int\"},{\"name\":\"month\",\"type\":\"string\"},{\"name\":\"high\",\"type\":\"double\"},{\"name\":\"low\",\"type\":\"double\"},{\"name\":\"key\",\"type\":\"string\"},{\"name\":\"date\",\"type\":\"string\"},{\"name\":\"close\",\"type\":\"double\"},{\"name\":\"open\",\"type\":\"double\"},{\"name\":\"day\",\"type\":\"string\"}]}", + "deltastreamer.checkpoint.key" : "stock_ticks,0:1668" + }, + "operationType" : "UPSERT", + 
"fileIdAndRelativePaths" : { + "871677fb-e0e3-46f8-9cc1-fe497e317216-0" : "2018/08/31/871677fb-e0e3-46f8-9cc1-fe497e317216-0_0-28-26_20211216071453747.parquet" + }, + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 750, + "totalUpsertTime" : 0, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + }, + "writePartitionPaths" : [ "2018/08/31" ] +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_cow/.hoodie/20211216071453747.commit.requested b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_cow/.hoodie/20211216071453747.commit.requested new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_cow/.hoodie/20211216071453747.inflight b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_cow/.hoodie/20211216071453747.inflight new file mode 100644 index 000000000000..6dc689a285d9 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_cow/.hoodie/20211216071453747.inflight @@ -0,0 +1,48 @@ +{ + "partitionToWriteStats" : { + "2018/08/31" : [ { + "fileId" : "", + "path" : null, + "prevCommit" : "null", + "numWrites" : 0, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 99, + "totalWriteBytes" : 0, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : null, + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 0, + "minEventTime" : null, + "maxEventTime" : null + } ] + }, + "compacted" : false, + "extraMetadata" : { }, + "operationType" : "UPSERT", + "fileIdAndRelativePaths" : { + "" : 
null + }, + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 0, + "totalUpsertTime" : 0, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + }, + "writePartitionPaths" : [ "2018/08/31" ] +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_cow/.hoodie/hoodie.properties b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_cow/.hoodie/hoodie.properties new file mode 100644 index 000000000000..4754c1c23eb2 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_cow/.hoodie/hoodie.properties @@ -0,0 +1,13 @@ +#Properties saved on Thu Dec 16 07:14:51 UTC 2021 +#Thu Dec 16 07:14:51 UTC 2021 +hoodie.table.precombine.field=ts +hoodie.table.partition.fields=date +hoodie.table.type=COPY_ON_WRITE +hoodie.archivelog.folder=archived +hoodie.populate.meta.fields=true +hoodie.timeline.layout.version=1 +hoodie.table.version=3 +hoodie.table.recordkey.fields=key +hoodie.table.base.file.format=PARQUET +hoodie.table.keygenerator.class=org.apache.hudi.keygen.SimpleKeyGenerator +hoodie.table.name=stock_ticks_cow diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_cow/2018/08/31/.hoodie_partition_metadata b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_cow/2018/08/31/.hoodie_partition_metadata new file mode 100644 index 000000000000..1aaf9e64d933 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_cow/2018/08/31/.hoodie_partition_metadata @@ -0,0 +1,4 @@ +#partition metadata +#Thu Dec 16 07:14:56 UTC 2021 +commitTime=20211216071453747 +partitionDepth=3 diff --git 
a/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_cow/2018/08/31/871677fb-e0e3-46f8-9cc1-fe497e317216-0_0-28-26_20211216071453747.parquet b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_cow/2018/08/31/871677fb-e0e3-46f8-9cc1-fe497e317216-0_0-28-26_20211216071453747.parquet new file mode 100644 index 000000000000..b97391697e62 Binary files /dev/null and b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_cow/2018/08/31/871677fb-e0e3-46f8-9cc1-fe497e317216-0_0-28-26_20211216071453747.parquet differ diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/20211221030120532.deltacommit b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/20211221030120532.deltacommit new file mode 100644 index 000000000000..f9e28873d524 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/20211221030120532.deltacommit @@ -0,0 +1,51 @@ +{ + "partitionToWriteStats" : { + "2018/08/31" : [ { + "fileId" : "167a0e3e-9b94-444f-a178-242230cdb5a2-0", + "path" : "2018/08/31/167a0e3e-9b94-444f-a178-242230cdb5a2-0_0-28-26_20211221030120532.parquet", + "prevCommit" : "null", + "numWrites" : 99, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 99, + "totalWriteBytes" : 440746, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : "2018/08/31", + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 440746, + "minEventTime" : null, + "maxEventTime" : null + } ] + }, + "compacted" : false, + "extraMetadata" : { + "schema" : 
"{\"type\":\"record\",\"name\":\"stock_ticks\",\"fields\":[{\"name\":\"volume\",\"type\":\"long\"},{\"name\":\"ts\",\"type\":\"string\"},{\"name\":\"symbol\",\"type\":\"string\"},{\"name\":\"year\",\"type\":\"int\"},{\"name\":\"month\",\"type\":\"string\"},{\"name\":\"high\",\"type\":\"double\"},{\"name\":\"low\",\"type\":\"double\"},{\"name\":\"key\",\"type\":\"string\"},{\"name\":\"date\",\"type\":\"string\"},{\"name\":\"close\",\"type\":\"double\"},{\"name\":\"open\",\"type\":\"double\"},{\"name\":\"day\",\"type\":\"string\"}]}", + "deltastreamer.checkpoint.key" : "stock_ticks,0:1668" + }, + "operationType" : "UPSERT", + "fileIdAndRelativePaths" : { + "167a0e3e-9b94-444f-a178-242230cdb5a2-0" : "2018/08/31/167a0e3e-9b94-444f-a178-242230cdb5a2-0_0-28-26_20211221030120532.parquet" + }, + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 1402, + "totalUpsertTime" : 0, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + }, + "writePartitionPaths" : [ "2018/08/31" ] +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/20211221030120532.deltacommit.inflight b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/20211221030120532.deltacommit.inflight new file mode 100644 index 000000000000..6dc689a285d9 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/20211221030120532.deltacommit.inflight @@ -0,0 +1,48 @@ +{ + "partitionToWriteStats" : { + "2018/08/31" : [ { + "fileId" : "", + "path" : null, + "prevCommit" : "null", + "numWrites" : 0, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 99, + "totalWriteBytes" : 0, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : null, + "totalLogRecords" : 0, + 
"totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 0, + "minEventTime" : null, + "maxEventTime" : null + } ] + }, + "compacted" : false, + "extraMetadata" : { }, + "operationType" : "UPSERT", + "fileIdAndRelativePaths" : { + "" : null + }, + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 0, + "totalUpsertTime" : 0, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + }, + "writePartitionPaths" : [ "2018/08/31" ] +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/20211221030120532.deltacommit.requested b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/20211221030120532.deltacommit.requested new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/20211227092838847.deltacommit b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/20211227092838847.deltacommit new file mode 100644 index 000000000000..f1cc26fecc7b --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/20211227092838847.deltacommit @@ -0,0 +1,55 @@ +{ + "partitionToWriteStats" : { + "2018/08/31" : [ { + "fileId" : "167a0e3e-9b94-444f-a178-242230cdb5a2-0", + "path" : "2018/08/31/.167a0e3e-9b94-444f-a178-242230cdb5a2-0_20211221030120532.log.1_0-28-29", + "prevCommit" : "20211221030120532", + "numWrites" : 99, + "numDeletes" : 0, + "numUpdateWrites" : 99, + "numInserts" : 0, + "totalWriteBytes" : 22220, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : "2018/08/31", + "totalLogRecords" : 0, 
+ "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 22220, + "minEventTime" : null, + "maxEventTime" : null, + "logVersion" : 1, + "logOffset" : 0, + "baseFile" : "167a0e3e-9b94-444f-a178-242230cdb5a2-0_0-28-26_20211221030120532.parquet", + "logFiles" : [ ".167a0e3e-9b94-444f-a178-242230cdb5a2-0_20211221030120532.log.1_0-28-29" ] + } ] + }, + "compacted" : false, + "extraMetadata" : { + "schema" : "{\"type\":\"record\",\"name\":\"stock_ticks\",\"fields\":[{\"name\":\"volume\",\"type\":\"long\"},{\"name\":\"ts\",\"type\":\"string\"},{\"name\":\"symbol\",\"type\":\"string\"},{\"name\":\"year\",\"type\":\"int\"},{\"name\":\"month\",\"type\":\"string\"},{\"name\":\"high\",\"type\":\"double\"},{\"name\":\"low\",\"type\":\"double\"},{\"name\":\"key\",\"type\":\"string\"},{\"name\":\"date\",\"type\":\"string\"},{\"name\":\"close\",\"type\":\"double\"},{\"name\":\"open\",\"type\":\"double\"},{\"name\":\"day\",\"type\":\"string\"}]}", + "deltastreamer.checkpoint.key" : "stock_ticks,0:3336" + }, + "operationType" : "UPSERT", + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 0, + "totalUpsertTime" : 187, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + }, + "writePartitionPaths" : [ "2018/08/31" ], + "fileIdAndRelativePaths" : { + "167a0e3e-9b94-444f-a178-242230cdb5a2-0" : "2018/08/31/.167a0e3e-9b94-444f-a178-242230cdb5a2-0_20211221030120532.log.1_0-28-29" + } +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/20211227092838847.deltacommit.inflight 
b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/20211227092838847.deltacommit.inflight new file mode 100644 index 000000000000..724ce56ff0d6 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/20211227092838847.deltacommit.inflight @@ -0,0 +1,71 @@ +{ + "partitionToWriteStats" : { + "2018/08/31" : [ { + "fileId" : "", + "path" : null, + "prevCommit" : "null", + "numWrites" : 0, + "numDeletes" : 0, + "numUpdateWrites" : 0, + "numInserts" : 0, + "totalWriteBytes" : 0, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : null, + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 0, + "minEventTime" : null, + "maxEventTime" : null + }, { + "fileId" : "167a0e3e-9b94-444f-a178-242230cdb5a2-0", + "path" : null, + "prevCommit" : "20211221030120532", + "numWrites" : 0, + "numDeletes" : 0, + "numUpdateWrites" : 99, + "numInserts" : 0, + "totalWriteBytes" : 0, + "totalWriteErrors" : 0, + "tempPath" : null, + "partitionPath" : null, + "totalLogRecords" : 0, + "totalLogFilesCompacted" : 0, + "totalLogSizeCompacted" : 0, + "totalUpdatedRecordsCompacted" : 0, + "totalLogBlocks" : 0, + "totalCorruptLogBlock" : 0, + "totalRollbackBlocks" : 0, + "fileSizeInBytes" : 0, + "minEventTime" : null, + "maxEventTime" : null + } ] + }, + "compacted" : false, + "extraMetadata" : { }, + "operationType" : "UPSERT", + "totalRecordsDeleted" : 0, + "totalLogRecordsCompacted" : 0, + "totalLogFilesCompacted" : 0, + "totalCompactedRecordsUpdated" : 0, + "totalLogFilesSize" : 0, + "totalScanTime" : 0, + "totalCreateTime" : 0, + "totalUpsertTime" : 0, + "minAndMaxEventTime" : { + "Optional.empty" : { + "val" : null, + "present" : false + } + }, + "writePartitionPaths" : [ "2018/08/31" ], + "fileIdAndRelativePaths" : { + "" : null, + 
"167a0e3e-9b94-444f-a178-242230cdb5a2-0" : null + } +} \ No newline at end of file diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/20211227092838847.deltacommit.requested b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/20211227092838847.deltacommit.requested new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/hoodie.properties b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/hoodie.properties new file mode 100644 index 000000000000..33392aa182f2 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/.hoodie/hoodie.properties @@ -0,0 +1,14 @@ +#Properties saved on Tue Dec 21 03:01:13 UTC 2021 +#Tue Dec 21 03:01:13 UTC 2021 +hoodie.table.precombine.field=ts +hoodie.table.partition.fields=date +hoodie.table.type=MERGE_ON_READ +hoodie.archivelog.folder=archived +hoodie.populate.meta.fields=true +hoodie.compaction.payload.class=org.apache.hudi.common.model.OverwriteWithLatestAvroPayload +hoodie.timeline.layout.version=1 +hoodie.table.version=3 +hoodie.table.recordkey.fields=key +hoodie.table.base.file.format=PARQUET +hoodie.table.keygenerator.class=org.apache.hudi.keygen.SimpleKeyGenerator +hoodie.table.name=stock_ticks_mor diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/2018/08/31/.167a0e3e-9b94-444f-a178-242230cdb5a2-0_20211221030120532.log.1_0-28-29 b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/2018/08/31/.167a0e3e-9b94-444f-a178-242230cdb5a2-0_20211221030120532.log.1_0-28-29 new file mode 100644 index 000000000000..da3c7bc07ee1 Binary files /dev/null and b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/2018/08/31/.167a0e3e-9b94-444f-a178-242230cdb5a2-0_20211221030120532.log.1_0-28-29 differ diff --git 
a/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/2018/08/31/.hoodie_partition_metadata b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/2018/08/31/.hoodie_partition_metadata new file mode 100644 index 000000000000..340533d6e680 --- /dev/null +++ b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/2018/08/31/.hoodie_partition_metadata @@ -0,0 +1,4 @@ +#partition metadata +#Tue Dec 21 03:01:25 UTC 2021 +commitTime=20211221030120532 +partitionDepth=3 diff --git a/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/2018/08/31/167a0e3e-9b94-444f-a178-242230cdb5a2-0_0-28-26_20211221030120532.parquet b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/2018/08/31/167a0e3e-9b94-444f-a178-242230cdb5a2-0_0-28-26_20211221030120532.parquet new file mode 100644 index 000000000000..9fe2112d09bb Binary files /dev/null and b/plugin/trino-hudi/src/test/resources/hudi-testing-data/stock_ticks_mor/2018/08/31/167a0e3e-9b94-444f-a178-242230cdb5a2-0_0-28-26_20211221030120532.parquet differ diff --git a/pom.xml b/pom.xml index e98c738667ac..7aa397cebc00 100644 --- a/pom.xml +++ b/pom.xml @@ -132,6 +132,7 @@ plugin/trino-hive plugin/trino-hive-hadoop2 plugin/trino-http-event-listener + plugin/trino-hudi plugin/trino-iceberg plugin/trino-jmx plugin/trino-kafka @@ -340,6 +341,12 @@ ${project.version} + + io.trino + trino-hudi + ${project.version} + + io.trino trino-iceberg diff --git a/testing/trino-server-dev/etc/catalog/hudi.properties b/testing/trino-server-dev/etc/catalog/hudi.properties new file mode 100644 index 000000000000..71e05e75227f --- /dev/null +++ b/testing/trino-server-dev/etc/catalog/hudi.properties @@ -0,0 +1,15 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +connector.name=hudi +hive.metastore.uri=thrift://localhost:9083 diff --git a/testing/trino-server-dev/etc/config.properties b/testing/trino-server-dev/etc/config.properties index ac207e125b69..ac96118b9eaa 100644 --- a/testing/trino-server-dev/etc/config.properties +++ b/testing/trino-server-dev/etc/config.properties @@ -38,6 +38,7 @@ plugin.bundles=\ ../../plugin/trino-jmx/pom.xml,\ ../../plugin/trino-raptor-legacy/pom.xml,\ ../../plugin/trino-hive-hadoop2/pom.xml,\ + ../../plugin/trino-hudi/pom.xml,\ ../../plugin/trino-example-http/pom.xml,\ ../../plugin/trino-kafka/pom.xml, \ ../../plugin/trino-tpch/pom.xml, \