Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 50 additions & 52 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@
<dep.kafka.version>2.3.1</dep.kafka.version>
<dep.druid.version>0.19.0</dep.druid.version>
<dep.jaxb.version>2.3.1</dep.jaxb.version>
<dep.hudi.version>0.9.0</dep.hudi.version>
<dep.hudi.version>0.10.0</dep.hudi.version>
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this version bump be called out in the PR summary?

<dep.testcontainers.version>1.15.1</dep.testcontainers.version>
<!--
America/Bahia_Banderas has:
Expand Down Expand Up @@ -1097,109 +1097,107 @@
</exclusions>
</dependency>

<!-- Start dependencies for querying Hudi table-->
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-common</artifactId>
<artifactId>hudi-presto-bundle</artifactId>
<version>${dep.hudi.version}</version>
<exclusions>
<exclusion>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<groupId>com.esotericsoftware</groupId>
<artifactId>kryo-shaded</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.orc</groupId>
<artifactId>orc-core</artifactId>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>org.objenesis</groupId>
<artifactId>objenesis</artifactId>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</exclusion>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
</exclusion>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>fluent-hc</artifactId>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</exclusion>
<exclusion>
<groupId>org.rocksdb</groupId>
<artifactId>rocksdbjni</artifactId>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
</exclusion>
<exclusion>
<groupId>com.esotericsoftware</groupId>
<artifactId>kryo-shaded</artifactId>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-shaded-server</artifactId>
</exclusion>
</exclusions>
</dependency>

<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hadoop-mr</artifactId>
<version>${dep.hudi.version}</version>
<exclusions>
<exclusion>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<groupId>org.apache.htrace</groupId>
<artifactId>htrace-core</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.orc</groupId>
<artifactId>orc-core</artifactId>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>fluent-hc</artifactId>
</exclusion>
<exclusion>
<groupId>org.objenesis</groupId>
<artifactId>objenesis</artifactId>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</exclusion>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-common</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hadoop-mr-bundle</artifactId>
</exclusion>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<groupId>org.apache.orc</groupId>
<artifactId>orc-core</artifactId>
</exclusion>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-avro</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<groupId>org.codehaus.jackson</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>fluent-hc</artifactId>
<groupId>org.lz4</groupId>
<artifactId>lz4-java</artifactId>
</exclusion>
<exclusion>
<groupId>org.objenesis</groupId>
<artifactId>objenesis</artifactId>
</exclusion>
<exclusion>
<groupId>org.rocksdb</groupId>
<artifactId>rocksdbjni</artifactId>
</exclusion>
<exclusion>
<groupId>com.esotericsoftware</groupId>
<artifactId>kryo-shaded</artifactId>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- End dependencies for querying Hudi table-->

<dependency>
<groupId>net.sf.opencsv</groupId>
Expand Down
2 changes: 2 additions & 0 deletions presto-geospatial/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@
<ignoredClassPatterns>
<ignoredClassPattern>shaded.parquet.it.unimi.dsi.fastutil.*</ignoredClassPattern>
<ignoredClassPattern>module-info</ignoredClassPattern>
<ignoredClassPattern>org.apache.htrace.*</ignoredClassPattern>
<ignoredClassPattern>org.apache.parquet.avro.*</ignoredClassPattern>
</ignoredClassPatterns>
</configuration>
</plugin>
Expand Down
12 changes: 12 additions & 0 deletions presto-hive-hadoop2/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@
<ignoredClassPatterns>
<ignoredClassPattern>shaded.parquet.it.unimi.dsi.fastutil.*</ignoredClassPattern>
<ignoredClassPattern>module-info</ignoredClassPattern>
<ignoredClassPattern>org.apache.htrace.*</ignoredClassPattern>
<ignoredClassPattern>org.apache.parquet.avro.*</ignoredClassPattern>
</ignoredClassPatterns>
</configuration>
</plugin>
Expand Down Expand Up @@ -197,6 +199,11 @@
<ignoredResourcePattern>parquet.thrift</ignoredResourcePattern>
<ignoredResourcePattern>about.html</ignoredResourcePattern>
</ignoredResourcePatterns>
<ignoredClassPatterns>
<ignoredClassPattern>module-info</ignoredClassPattern>
<ignoredClassPattern>org.apache.htrace.*</ignoredClassPattern>
<ignoredClassPattern>org.apache.parquet.avro.*</ignoredClassPattern>
</ignoredClassPatterns>
</configuration>
</plugin>
</plugins>
Expand Down Expand Up @@ -224,6 +231,11 @@
<ignoredResourcePattern>parquet.thrift</ignoredResourcePattern>
<ignoredResourcePattern>about.html</ignoredResourcePattern>
</ignoredResourcePatterns>
<ignoredClassPatterns>
<ignoredClassPattern>module-info</ignoredClassPattern>
<ignoredClassPattern>org.apache.htrace.*</ignoredClassPattern>
<ignoredClassPattern>org.apache.parquet.avro.*</ignoredClassPattern>
</ignoredClassPatterns>
</configuration>
</plugin>
</plugins>
Expand Down
10 changes: 4 additions & 6 deletions presto-hive/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,7 @@

<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-common</artifactId>
</dependency>

<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hadoop-mr</artifactId>
<artifactId>hudi-presto-bundle</artifactId>
</dependency>

<dependency>
Expand Down Expand Up @@ -411,6 +406,9 @@
<ignoredClassPatterns>
<ignoredClassPattern>shaded.parquet.it.unimi.dsi.fastutil.*</ignoredClassPattern>
<ignoredClassPattern>module-info</ignoredClassPattern>
<!-- TODO: Remove this once shaded in hudi-presto-bundle -->
<ignoredClassPattern>org.apache.htrace.*</ignoredClassPattern>
<ignoredClassPattern>org.apache.parquet.avro.*</ignoredClassPattern>
</ignoredClassPatterns>
</configuration>
</plugin>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public Iterator<HiveFileInfo> list(
p -> new HadoopFileInfoIterator(fileSystem.listLocatedStatus(p)),
namenodeStats,
hiveDirectoryContext.getNestedDirectoryPolicy(),
pathFilter);
Optional.of(pathFilter));
}

public static class HadoopFileInfoIterator
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ public class HiveClientConfig

private boolean verboseRuntimeStatsEnabled;
private boolean useRecordPageSourceForCustomSplit = true;
private boolean hudiMetadataEnabled;

private boolean sizeBasedSplitWeightsEnabled = true;
private double minimumAssignedSplitWeight = 0.05;
Expand Down Expand Up @@ -1740,4 +1741,17 @@ public HiveClientConfig setUseRecordPageSourceForCustomSplit(boolean useRecordPa
this.useRecordPageSourceForCustomSplit = useRecordPageSourceForCustomSplit;
return this;
}

@Config("hive.hudi-metadata-enabled")
@ConfigDescription("For Hudi tables prefer to fetch the list of file names, sizes and other metadata from the internal metadata table rather than storage")
public HiveClientConfig setHudiMetadataEnabled(boolean hudiMetadataEnabled)
{
this.hudiMetadataEnabled = hudiMetadataEnabled;
return this;
}

public boolean isHudiMetadataEnabled()
{
return this.hudiMetadataEnabled;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ public final class HiveSessionProperties
public static final String SIZE_BASED_SPLIT_WEIGHTS_ENABLED = "size_based_split_weights_enabled";
public static final String MINIMUM_ASSIGNED_SPLIT_WEIGHT = "minimum_assigned_split_weight";
private static final String USE_RECORD_PAGE_SOURCE_FOR_CUSTOM_SPLIT = "use_record_page_source_for_custom_split";
private static final String HUDI_METADATA_ENABLED = "hudi_metadata_enabled";

private final List<PropertyMetadata<?>> sessionProperties;

Expand Down Expand Up @@ -651,6 +652,11 @@ public HiveSessionProperties(HiveClientConfig hiveClientConfig, OrcFileWriterCon
USE_RECORD_PAGE_SOURCE_FOR_CUSTOM_SPLIT,
"Use record page source for custom split",
hiveClientConfig.isUseRecordPageSourceForCustomSplit(),
false),
booleanProperty(
HUDI_METADATA_ENABLED,
"For Hudi tables prefer to fetch the list of file names, sizes and other metadata from the internal metadata table rather than storage",
hiveClientConfig.isHudiMetadataEnabled(),
false));
}

Expand Down Expand Up @@ -1126,4 +1132,9 @@ public static boolean isUseRecordPageSourceForCustomSplit(ConnectorSession sessi
{
return session.getProperty(USE_RECORD_PAGE_SOURCE_FOR_CUSTOM_SPLIT, Boolean.class);
}

public static boolean isHudiMetadataEnabled(ConnectorSession session)
{
return session.getProperty(HUDI_METADATA_ENABLED, Boolean.class);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -384,29 +384,33 @@ static boolean shouldUseRecordReaderFromInputFormat(Configuration configuration,
.anyMatch(USE_RECORD_READER_FROM_INPUT_FORMAT_ANNOTATION::equals);
}

static boolean shouldUseFileSplitsFromInputFormat(InputFormat<?, ?> inputFormat, Configuration conf, String tablePath)
static boolean shouldUseFileSplitsFromInputFormat(InputFormat<?, ?> inputFormat, DirectoryLister directoryLister)
{
boolean hasUseSplitsAnnotation = Arrays.stream(inputFormat.getClass().getAnnotations())
.map(Annotation::annotationType)
.map(Class::getSimpleName)
.anyMatch(USE_FILE_SPLITS_FROM_INPUT_FORMAT_ANNOTATION::equals);
if (directoryLister instanceof HudiDirectoryLister) {
boolean hasUseSplitsAnnotation = Arrays.stream(inputFormat.getClass().getAnnotations())
.map(Annotation::annotationType)
.map(Class::getSimpleName)
.anyMatch(USE_FILE_SPLITS_FROM_INPUT_FORMAT_ANNOTATION::equals);

return hasUseSplitsAnnotation &&
(!isHudiParquetInputFormat(inputFormat) || shouldUseFileSplitsForHudi(inputFormat, ((HudiDirectoryLister) directoryLister).getMetaClient()));
}

return hasUseSplitsAnnotation && (!isHudiParquetInputFormat(inputFormat) || shouldUseFileSplitsForHudi(inputFormat, conf, tablePath));
return false;
}

// Returns true when the given input format is Hudi's HoodieParquetInputFormat.
// Note: this is an instanceof check, so it also matches subclasses such as
// HoodieParquetRealtimeInputFormat (see the separate realtime check in
// shouldUseFileSplitsForHudi, which relies on exactly that distinction).
static boolean isHudiParquetInputFormat(InputFormat<?, ?> inputFormat)
{
return inputFormat instanceof HoodieParquetInputFormat;
}

private static boolean shouldUseFileSplitsForHudi(InputFormat<?, ?> inputFormat, Configuration conf, String tablePath)
private static boolean shouldUseFileSplitsForHudi(InputFormat<?, ?> inputFormat, HoodieTableMetaClient metaClient)
{
if (inputFormat instanceof HoodieParquetRealtimeInputFormat) {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this check the same as the !isHudiParquetInputFormat(inputFormat) check above? Any chance to simplify?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

HoodieParquetRealtimeInputFormat is a subclass of HoodieParquetInputFormat. However, we want to use file splits from the former but not the latter. Hence, a separate check.

return true;
}

HoodieTableMetaClient hoodieTableMetaClient = HoodieTableMetaClient.builder().setConf(conf).setBasePath(tablePath).build();
return hoodieTableMetaClient.getTableConfig().getBootstrapBasePath().isPresent();
return metaClient.getTableConfig().getBootstrapBasePath().isPresent();
}

public static long parseHiveDate(String value)
Expand Down
Loading