Merged
Changes from all 47 commits
51ee500  [HUDI-2973] RFC-27: Data skipping index to improve query performance … (manojpec, Mar 3, 2022)
876a891  [HUDI-3544] Fixing "populate meta fields" update to metadata table (#… (nsivabalan, Mar 3, 2022)
a4ba0ff  [HUDI-3552] Strength the NetworkUtils#getHostname by checking network… (danny0405, Mar 3, 2022)
be9a264  [HUDI-3548] Fix if user specify key "hoodie.datasource.clustering.asy… (Mar 4, 2022)
62f534d  [HUDI-3445] Support Clustering Command Based on Call Procedure Comman… (huberylee, Mar 4, 2022)
6faed3d  [HUDI-3161][RFC-47] Add Call Produce Command for Spark SQL (#4607) (XuQianJin-Stars, Mar 4, 2022)
f449807  [MINOR] fix UTC timezone config (#4950) (YuweiXiao, Mar 4, 2022)
b4362fa  [HUDI-3348] Add UT to verify HoodieRealtimeFileSplit serde (#4951) (xushiyan, Mar 4, 2022)
0986d5a  [HUDI-3460] Add reader merge memory option for flink (#4911) (cuibo01, Mar 4, 2022)
6a46130  [HUDI-2761] Fixing timeline server for repeated refreshes (#4812) (nsivabalan, Mar 5, 2022)
051ad0b  [HUDI-3130] Fixing Hive getSchema for RT tables addressing different … (aditiwari01, Mar 6, 2022)
4b47177  [HUDI-3520] Introduce DeleteSupportSchemaPostProcessor to support add… (wangxianghu, Mar 6, 2022)
c9ffdc4  [HUDI-3525] Introduce JsonkafkaSourceProcessor to support data prepro… (wangxianghu, Mar 6, 2022)
6f57bbf  [HUDI-3069] Improve HoodieMergedLogRecordScanner avoid putting unnece… (scxwhite, Mar 7, 2022)
3539578  [HUDI-3213] Making commit preserve metadata to true for compaction (#… (nsivabalan, Mar 7, 2022)
f0bcee3  [HUDI-3561] Avoid including whole `MultipleSparkJobExecutionStrategy`… (Mar 7, 2022)
a66fd40  [HUDI-3365] Make sure Metadata Table records are updated appropriatel… (Mar 7, 2022)
53826d6  [HUDI-2747] support set --sparkMaster for MDT cli (#4964) (zhangyue19921010, Mar 7, 2022)
2904076  [HUDI-3576] Configuring timeline refreshes based on latest commit (#4… (nsivabalan, Mar 7, 2022)
34bc752  [HUDI-3573] flink cleanFuntion execute clean on initialization (#4936) (todd5167, Mar 8, 2022)
b6bdb46  [MINOR][HUDI-3460]Fix HoodieDataSourceITCase (cuibo01, Mar 6, 2022)
fe53bd2  [HUDI-2677] Add DFS based message queue for flink writer[part3] (#4961) (danny0405, Mar 8, 2022)
2538580  [HUDI-3574] Improve maven module configs for different spark profiles… (XuQianJin-Stars, Mar 8, 2022)
ed26c52  [HUDI-3584] Skip integ test modules by default (#4986) (xushiyan, Mar 8, 2022)
575bc63  [HUDI-3356][HUDI-3203] HoodieData for metadata index records; BloomFi… (codope, Mar 8, 2022)
08fd80c  [HUDI-3221] Support querying a table as of a savepoint (#4720) (XuQianJin-Stars, Mar 8, 2022)
4324e87  [HUDI-3587] Making SupportsUpgradeDowngrade serializable (#4991) (nsivabalan, Mar 9, 2022)
548000b  [HUDI-3568] Introduce ChainedSchemaPostProcessor to support setting m… (wangxianghu, Mar 9, 2022)
8859b48  [HUDI-3383] Sync column comments while syncing a hive table (#4960) (MrSleeping123, Mar 10, 2022)
ca0b8fc  [MINOR] Add IT CI Test timeout option (#5003) (XuQianJin-Stars, Mar 10, 2022)
034adda  [HUDI-3396] Make sure `BaseFileOnlyViewRelation` only reads projected… (Mar 10, 2022)
ec24407  [HUDI-3581] Reorganize some clazz for hudi flink (#4983) (danny0405, Mar 10, 2022)
4e09545  [HUDI-3602][DOCS] Update docker README to build multi-arch images usi… (codope, Mar 10, 2022)
fa5e750  [HUDI-3586] Add Trino Queries in integration tests (#4988) (yihua, Mar 11, 2022)
9dc6df5  [HUDI-3595] Fixing NULL schema provider for empty batch (#5002) (nsivabalan, Mar 11, 2022)
83cff3a  [HUDI-3522] Introduce DropColumnSchemaPostProcessor to support drop c… (wangxianghu, Mar 11, 2022)
18cdad9  [HUDI-2999] [RFC-42] RFC for consistent hashing index (#4326) (YuweiXiao, Mar 11, 2022)
faed699  [HUDI-3566] Add thread factory in BoundedInMemoryExecutor (#4926) (scxwhite, Mar 11, 2022)
b001803  [HUDI-3575] Use HoodieTestDataGenerator#TRIP_SCHEMA as example schema… (wangxianghu, Mar 11, 2022)
56cb494  [HUDI-3567] Refactor HoodieCommonUtils to make code more reasonable (… (huberylee, Mar 11, 2022)
5d59bf6  [HUDI-3513] Make sure Column Stats does not fail in case it fails to … (Mar 11, 2022)
93277b2  [HUDI-3592] Fix NPE of DefaultHoodieRecordPayload if Property is empt… (Mar 11, 2022)
e8918b6  [HUDI-3569] Introduce ChainedJsonKafkaSourePostProcessor to support s… (wangxianghu, Mar 11, 2022)
e7bb041  [HUDI-3556] Re-use rollback instant for rolling back of clustering an… (nsivabalan, Mar 11, 2022)
eee96e9  [HUDI-3593] Restore TypedProperties and flush checksum in table confi… (codope, Mar 13, 2022)
e60acc1  [HUDI-3583] Fix MarkerBasedRollbackStrategy NoSuchElementException (#… (liujinhui1994, Mar 13, 2022)
6c8224c  [HUDI-3501] Support savepoints command based on Call Produce Command … (XuQianJin-Stars, Mar 13, 2022)
7 changes: 1 addition & 6 deletions .github/workflows/bot.yml
@@ -18,16 +18,12 @@ jobs:
include:
- scala: "scala-2.11"
spark: "spark2"
skipModules: ""
- scala: "scala-2.11"
spark: "spark2,spark-shade-unbundle-avro"
skipModules: ""
- scala: "scala-2.12"
spark: "spark3.1.x"
skipModules: "!hudi-spark-datasource/hudi-spark3"
- scala: "scala-2.12"
spark: "spark3.1.x,spark-shade-unbundle-avro"
skipModules: "!hudi-spark-datasource/hudi-spark3"
- scala: "scala-2.12"
spark: "spark3"
- scala: "scala-2.12"
@@ -44,5 +40,4 @@ jobs:
env:
SCALA_PROFILE: ${{ matrix.scala }}
SPARK_PROFILE: ${{ matrix.spark }}
SKIP_MODULES: ${{ matrix.skipModules }}
run: mvn install -P "$SCALA_PROFILE,$SPARK_PROFILE" -pl "$SKIP_MODULES" -DskipTests=true -Dmaven.javadoc.skip=true -B -V
run: mvn install -P "$SCALA_PROFILE,$SPARK_PROFILE" -DskipTests=true -Dmaven.javadoc.skip=true -B -V
2 changes: 2 additions & 0 deletions azure-pipelines.yml
@@ -179,6 +179,8 @@ stages:
jdkVersionOption: '1.8'
mavenOptions: '-Xmx2g $(MAVEN_OPTS)'
- job: IT
displayName: IT modules
timeoutInMinutes: '90'
steps:
- task: AzureCLI@2
displayName: Prepare for IT
96 changes: 95 additions & 1 deletion docker/README.md
@@ -90,4 +90,98 @@ You can find more information on [Docker Hub Repositories Manual](https://docs.d

## Docker Demo Setup

Please refer to the [Docker Demo Docs page](https://hudi.apache.org/docs/docker_demo).
Please refer to the [Docker Demo Docs page](https://hudi.apache.org/docs/docker_demo).

## Building Multi-Arch Images

NOTE: The steps below require some code changes. Support for multi-arch builds in a fully automated manner is being
tracked by [HUDI-3601](https://issues.apache.org/jira/browse/HUDI-3601).

By default, the docker images are built for the x86_64 (amd64) architecture. Docker `buildx` lets you build multi-arch
images, link them together with a manifest file, and push them all to a registry with a single command. Suppose we
want to build for the arm64 architecture. First, ensure that `buildx` is set up locally by following the steps below
(adapted from https://www.docker.com/blog/multi-arch-images):

```
# List builders
~ ❯❯❯ docker buildx ls
NAME/NODE DRIVER/ENDPOINT STATUS PLATFORMS
default * docker
default default running linux/amd64, linux/arm64, linux/arm/v7, linux/arm/v6

# If you are using the default builder, which is basically the old builder, then do the following
~ ❯❯❯ docker buildx create --name mybuilder
mybuilder
~ ❯❯❯ docker buildx use mybuilder
~ ❯❯❯ docker buildx inspect --bootstrap
[+] Building 2.5s (1/1) FINISHED
=> [internal] booting buildkit 2.5s
=> => pulling image moby/buildkit:master 1.3s
=> => creating container buildx_buildkit_mybuilder0 1.2s
Name: mybuilder
Driver: docker-container

Nodes:
Name: mybuilder0
Endpoint: unix:///var/run/docker.sock
Status: running

Platforms: linux/amd64, linux/arm64, linux/arm/v7, linux/arm/v6
```

Now go to `<HUDI_REPO_DIR>/docker/hoodie/hadoop` and change the Dockerfiles to pull dependent images built for
arm64. For example, in [base/Dockerfile](./hoodie/hadoop/base/Dockerfile) (which pulls the JDK 8 image), change the
line `FROM openjdk:8u212-jdk-slim-stretch` to `FROM arm64v8/openjdk:8u212-jdk-slim-stretch`.
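
For instance, the change in the base Dockerfile would look roughly like this (a sketch; only the `FROM` line changes):

```
# Before: pulls the default (amd64) JDK 8 base image
FROM openjdk:8u212-jdk-slim-stretch

# After: pulls the arm64 variant of the same JDK 8 image
FROM arm64v8/openjdk:8u212-jdk-slim-stretch
```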

Then, from the `<HUDI_REPO_DIR>/docker/hoodie/hadoop` directory, execute the following command to build the image and
push it to the Docker Hub repo:

```
# Run under hoodie/hadoop; <tag> is optional and defaults to "latest"
docker buildx build <image_folder_name> --platform <comma-separated,platforms> -t <hub-user>/<repo-name>[:<tag>] --push

# For example, to build base image
docker buildx build base --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-base:linux-arm64-0.10.1 --push
```

Once the base image is pushed, you can do the same for the other images.
Change the [hive](./hoodie/hadoop/hive_base/Dockerfile) Dockerfile to pull the base image with the tag corresponding to
the linux/arm64 platform.

```
# Change the following line in the Dockerfile
FROM apachehudi/hudi-hadoop_${HADOOP_VERSION}-base:latest
# to the following
FROM --platform=linux/arm64 apachehudi/hudi-hadoop_${HADOOP_VERSION}-base:linux-arm64-0.10.1

# and then build & push from under hoodie/hadoop dir
docker buildx build hive_base --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-hive_2.3.3:linux-arm64-0.10.1 --push
```

Similarly, for the images that depend on hive (e.g. [base spark](./hoodie/hadoop/spark_base/Dockerfile),
[sparkmaster](./hoodie/hadoop/sparkmaster/Dockerfile), [sparkworker](./hoodie/hadoop/sparkworker/Dockerfile)
and [sparkadhoc](./hoodie/hadoop/sparkadhoc/Dockerfile)), change the corresponding Dockerfile to pull the hive base
image with the arm64 tag, then build and push using the `docker buildx` command.

For the sake of completeness, here is a [patch](https://gist.github.com/xushiyan/cec16585e884cf0693250631a1d10ec2)
showing the changes to make in the Dockerfiles (assuming the tag is named `linux-arm64-0.10.1`), followed by the full
list of `docker buildx` commands.

```
docker buildx build base --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-base:linux-arm64-0.10.1 --push
docker buildx build datanode --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-datanode:linux-arm64-0.10.1 --push
docker buildx build historyserver --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-history:linux-arm64-0.10.1 --push
docker buildx build hive_base --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-hive_2.3.3:linux-arm64-0.10.1 --push
docker buildx build namenode --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-namenode:linux-arm64-0.10.1 --push
docker buildx build prestobase --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-prestobase_0.217:linux-arm64-0.10.1 --push
docker buildx build spark_base --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkbase_2.4.4:linux-arm64-0.10.1 --push
docker buildx build sparkadhoc --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:linux-arm64-0.10.1 --push
docker buildx build sparkmaster --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.4.4:linux-arm64-0.10.1 --push
docker buildx build sparkworker --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_2.4.4:linux-arm64-0.10.1 --push
```

Once all the required images are pushed to the Docker Hub repos, one additional change is needed in
the [docker compose](./compose/docker-compose_hadoop284_hive233_spark244.yml) file.
Apply [this patch](https://gist.github.com/codope/3dd986de5e54f0650dd74b6032e4456c) to the docker compose file so
that [setup_demo](./setup_demo.sh) pulls images with the correct arm64 tag. After that, you should be ready to run the
setup script and follow the docker demo.
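
A minimal sketch of those last steps, assuming the compose patch has been saved locally as `compose-arm64.patch` (the
file name is illustrative):

```
# Apply the compose changes so the demo pulls the arm64-tagged images
cd <HUDI_REPO_DIR>/docker
git apply compose-arm64.patch   # or edit compose/docker-compose_hadoop284_hive233_spark244.yml by hand

# Bring up the demo containers with the updated image tags
./setup_demo.sh
```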
23 changes: 23 additions & 0 deletions docker/demo/trino-batch1.commands
@@ -0,0 +1,23 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';
select symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';
select symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG';
21 changes: 21 additions & 0 deletions docker/demo/trino-batch2-after-compaction.commands
@@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';
select symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG';
20 changes: 20 additions & 0 deletions docker/demo/trino-table-check.commands
@@ -0,0 +1,20 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

show tables;
@@ -108,7 +108,9 @@ public String set(@CliOption(key = {"metadataDir"},
}

@CliCommand(value = "metadata create", help = "Create the Metadata Table if it does not exist")
public String create() throws IOException {
public String create(
@CliOption(key = "sparkMaster", unspecifiedDefaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master
) throws IOException {
HoodieCLI.getTableMetaClient();
Path metadataPath = new Path(getMetadataTableBasePath(HoodieCLI.basePath));
try {
@@ -123,7 +125,7 @@ public String create() throws IOException {

HoodieTimer timer = new HoodieTimer().startTimer();
HoodieWriteConfig writeConfig = getWriteConfig();
initJavaSparkContext();
initJavaSparkContext(Option.of(master));
SparkHoodieBackedTableMetadataWriter.create(HoodieCLI.conf, writeConfig, new HoodieSparkEngineContext(jsc));
return String.format("Created Metadata Table in %s (duration=%.2f secs)", metadataPath, timer.endTimer() / 1000.0);
}
Expand All @@ -145,7 +147,8 @@ public String delete() throws Exception {
}

@CliCommand(value = "metadata init", help = "Update the metadata table from commits since the creation")
public String init(@CliOption(key = {"readonly"}, unspecifiedDefaultValue = "false",
public String init(@CliOption(key = "sparkMaster", unspecifiedDefaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master,
@CliOption(key = {"readonly"}, unspecifiedDefaultValue = "false",
help = "Open in read-only mode") final boolean readOnly) throws Exception {
HoodieCLI.getTableMetaClient();
Path metadataPath = new Path(getMetadataTableBasePath(HoodieCLI.basePath));
@@ -159,7 +162,7 @@ public String init(@CliOption(key = {"readonly"}, unspecifiedDefaultValue = "fal
HoodieTimer timer = new HoodieTimer().startTimer();
if (!readOnly) {
HoodieWriteConfig writeConfig = getWriteConfig();
initJavaSparkContext();
initJavaSparkContext(Option.of(master));
SparkHoodieBackedTableMetadataWriter.create(HoodieCLI.conf, writeConfig, new HoodieSparkEngineContext(jsc));
}

@@ -191,9 +194,11 @@ public String stats() throws IOException {
}

@CliCommand(value = "metadata list-partitions", help = "List all partitions from metadata")
public String listPartitions() throws IOException {
public String listPartitions(
@CliOption(key = "sparkMaster", unspecifiedDefaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master
) throws IOException {
HoodieCLI.getTableMetaClient();
initJavaSparkContext();
initJavaSparkContext(Option.of(master));
HoodieMetadataConfig config = HoodieMetadataConfig.newBuilder().enable(true).build();
HoodieBackedTableMetadata metadata = new HoodieBackedTableMetadata(new HoodieSparkEngineContext(jsc), config,
HoodieCLI.basePath, "/tmp");
@@ -357,9 +362,9 @@ private HoodieWriteConfig getWriteConfig() {
.withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()).build();
}

private void initJavaSparkContext() {
private void initJavaSparkContext(Option<String> userDefinedMaster) {
if (jsc == null) {
jsc = SparkUtil.initJavaSparkConf(SparkUtil.getDefaultConf("HoodieCLI", Option.empty()));
jsc = SparkUtil.initJavaSparkConf(SparkUtil.getDefaultConf("HoodieCLI", userDefinedMaster));
}
}
}
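
The diff above adds a `--sparkMaster` option to the metadata CLI commands. A usage sketch from the hudi-cli shell (the
table path and master URL are illustrative):

```
connect --path /tmp/hudi/test_table
metadata create --sparkMaster local[2]
metadata list-partitions --sparkMaster local[2]
```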
@@ -23,20 +23,11 @@
import org.apache.hudi.cli.HoodieTableHeaderFields;
import org.apache.hudi.cli.utils.InputStreamConsumer;
import org.apache.hudi.cli.utils.SparkUtil;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.index.HoodieIndex;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.launcher.SparkLauncher;
import org.springframework.shell.core.CommandMarker;
import org.springframework.shell.core.annotation.CliCommand;
@@ -162,13 +153,4 @@ public String deleteSavepoint(@CliOption(key = {"commit"}, help = "Delete a save
}
return String.format("Savepoint \"%s\" deleted.", instantTime);
}

private static SparkRDDWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
.withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.NEVER).build())
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
return new SparkRDDWriteClient(new HoodieSparkEngineContext(jsc), config);
}

}
@@ -170,6 +170,7 @@ private void shutdownCallback(Function<Boolean, Boolean> callback) {
if (null != callback) {
callback.apply(null != error);
}
this.started = false;
});
}
