Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -513,7 +513,7 @@ public void preWrite(String instantTime, WriteOperationType writeOperationType,
* @param hoodieTable Hoodie Table
* @return Write Status
*/
protected abstract O postWrite(HoodieWriteMetadata<O> result, String instantTime, HoodieTable hoodieTable);
public abstract O postWrite(HoodieWriteMetadata<O> result, String instantTime, HoodieTable hoodieTable);

/**
* Post Commit Hook. Derived classes use this method to perform post-commit processing
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.HoodieConfig;
import org.apache.hudi.common.model.WriteOperationType;

/**
* Configs/params used for internal purposes.
Expand All @@ -37,6 +38,14 @@ public class HoodieInternalConfig extends HoodieConfig {
.markAdvanced()
.withDocumentation("Schema set for row writer/bulk insert.");

public static final ConfigProperty<String> BULKINSERT_OVERWRITE_OPERATION_TYPE = ConfigProperty
.key("hoodie.bulkinsert.overwrite.operation.type")
.noDefaultValue()
Comment thread
boneanxs marked this conversation as resolved.
.markAdvanced()
.withValidValues(WriteOperationType.INSERT_OVERWRITE_TABLE.value(), WriteOperationType.INSERT_OVERWRITE.value())
.withDocumentation("For SQL operations, if enables bulk_insert operation, "
+ "this configure will take effect to decide overwrite whole table or partitions specified");

/**
* Returns if partition records are sorted or not.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -342,9 +342,9 @@ public void waitForCleaningFinish() {
}

@Override
protected List<WriteStatus> postWrite(HoodieWriteMetadata<List<WriteStatus>> result,
String instantTime,
HoodieTable hoodieTable) {
public List<WriteStatus> postWrite(HoodieWriteMetadata<List<WriteStatus>> result,
String instantTime,
HoodieTable hoodieTable) {
if (result.getIndexLookupDuration().isPresent()) {
metrics.updateIndexMetrics(getOperationType().name(), result.getIndexUpdateDuration().get().toMillis());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -193,9 +193,9 @@ public List<WriteStatus> delete(List<HoodieKey> keys,
}

@Override
protected List<WriteStatus> postWrite(HoodieWriteMetadata<List<WriteStatus>> result,
String instantTime,
HoodieTable hoodieTable) {
public List<WriteStatus> postWrite(HoodieWriteMetadata<List<WriteStatus>> result,
String instantTime,
HoodieTable hoodieTable) {
if (result.getIndexLookupDuration().isPresent()) {
metrics.updateIndexMetrics(getOperationType().name(), result.getIndexUpdateDuration().get().toMillis());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -256,9 +256,9 @@ public HoodieWriteResult deletePartitions(List<String> partitions, String instan
}

@Override
protected JavaRDD<WriteStatus> postWrite(HoodieWriteMetadata<JavaRDD<WriteStatus>> result,
String instantTime,
HoodieTable hoodieTable) {
public JavaRDD<WriteStatus> postWrite(HoodieWriteMetadata<JavaRDD<WriteStatus>> result,
String instantTime,
HoodieTable hoodieTable) {
if (result.getIndexLookupDuration().isPresent()) {
metrics.updateIndexMetrics(getOperationType().name(), result.getIndexUpdateDuration().get().toMillis());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieClusteringException;
import org.apache.hudi.io.SingleFileHandleCreateFactory;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.commit.SparkBulkInsertHelper;

Expand Down Expand Up @@ -77,8 +78,11 @@ public HoodieData<WriteStatus> performClusteringWithRecordsAsRow(Dataset<Row> in
// Since clustering will write to single file group using HoodieUnboundedCreateHandle, set max file size to a large value.
newConfig.setValue(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE, String.valueOf(Long.MAX_VALUE));

return HoodieDatasetBulkInsertHelper.bulkInsert(inputRecords, instantTime, getHoodieTable(), newConfig,
getRowPartitioner(strategyParams, schema), numOutputGroups, shouldPreserveHoodieMetadata);
BulkInsertPartitioner<Dataset<Row>> partitioner = getRowPartitioner(strategyParams, schema);
Dataset<Row> repartitionedRecords = partitioner.repartitionRecords(inputRecords, numOutputGroups);
Comment thread
codope marked this conversation as resolved.

return HoodieDatasetBulkInsertHelper.bulkInsert(repartitionedRecords, instantTime, getHoodieTable(), newConfig,
partitioner.arePartitionRecordsSorted(), shouldPreserveHoodieMetadata);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.io.CreateHandleFactory;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.commit.SparkBulkInsertHelper;

Expand Down Expand Up @@ -69,8 +70,11 @@ public HoodieData<WriteStatus> performClusteringWithRecordsAsRow(Dataset<Row> in

newConfig.setValue(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE, String.valueOf(getWriteConfig().getClusteringMaxBytesInGroup()));

return HoodieDatasetBulkInsertHelper.bulkInsert(inputRecords, instantTime, getHoodieTable(), newConfig,
getRowPartitioner(strategyParams, schema), numOutputGroups, shouldPreserveHoodieMetadata);
BulkInsertPartitioner<Dataset<Row>> partitioner = getRowPartitioner(strategyParams, schema);
Dataset<Row> repartitionedRecords = partitioner.repartitionRecords(inputRecords, numOutputGroups);
Comment thread
codope marked this conversation as resolved.

return HoodieDatasetBulkInsertHelper.bulkInsert(repartitionedRecords, instantTime, getHoodieTable(), newConfig,
partitioner.arePartitionRecordsSorted(), shouldPreserveHoodieMetadata);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,13 +147,10 @@ object HoodieDatasetBulkInsertHelper
instantTime: String,
table: HoodieTable[_, _, _, _],
writeConfig: HoodieWriteConfig,
partitioner: BulkInsertPartitioner[Dataset[Row]],
parallelism: Int,
arePartitionRecordsSorted: Boolean,
shouldPreserveHoodieMetadata: Boolean): HoodieData[WriteStatus] = {
val repartitionedDataset = partitioner.repartitionRecords(dataset, parallelism)
val arePartitionRecordsSorted = partitioner.arePartitionRecordsSorted
val schema = dataset.schema
val writeStatuses = repartitionedDataset.queryExecution.toRdd.mapPartitions(iter => {
val writeStatuses = dataset.queryExecution.toRdd.mapPartitions(iter => {
val taskContextSupplier: TaskContextSupplier = table.getTaskContextSupplier
val taskPartitionId = taskContextSupplier.getPartitionIdSupplier.get
val taskId = taskContextSupplier.getStageIdSupplier.get.toLong
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hudi.commit;

import org.apache.hudi.DataSourceUtils;
import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.HoodieDatasetBulkInsertHelper;
import org.apache.hudi.client.HoodieWriteResult;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.util.CommitUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.data.HoodieJavaRDD;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.execution.bulkinsert.BucketIndexBulkInsertPartitionerWithRows;
import org.apache.hudi.execution.bulkinsert.BulkInsertInternalPartitionerWithRowsFactory;
import org.apache.hudi.execution.bulkinsert.NonSortPartitionerWithRows;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

import java.io.Serializable;
import java.util.Collections;
import java.util.List;
import java.util.Map;

public abstract class BaseDatasetBulkInsertCommitActionExecutor implements Serializable {

protected final transient HoodieWriteConfig writeConfig;
protected final transient SparkRDDWriteClient writeClient;
protected final String instantTime;
protected HoodieTable table;

public BaseDatasetBulkInsertCommitActionExecutor(HoodieWriteConfig config,
SparkRDDWriteClient writeClient,
String instantTime) {
this.writeConfig = config;
this.writeClient = writeClient;
this.instantTime = instantTime;
}

protected void preExecute() {
table.validateInsertSchema();
writeClient.startCommitWithTime(instantTime, getCommitActionType());
writeClient.preWrite(instantTime, getWriteOperationType(), table.getMetaClient());
}

protected abstract Option<HoodieData<WriteStatus>> doExecute(Dataset<Row> records, boolean arePartitionRecordsSorted);

protected void afterExecute(HoodieWriteMetadata<JavaRDD<WriteStatus>> result) {
writeClient.postWrite(result, instantTime, table);
}

private HoodieWriteMetadata<JavaRDD<WriteStatus>> buildHoodieWriteMetadata(Option<HoodieData<WriteStatus>> writeStatuses) {
return writeStatuses.map(statuses -> {
HoodieWriteMetadata<JavaRDD<WriteStatus>> hoodieWriteMetadata = new HoodieWriteMetadata<>();
hoodieWriteMetadata.setWriteStatuses(HoodieJavaRDD.getJavaRDD(statuses));
hoodieWriteMetadata.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(statuses));
return hoodieWriteMetadata;
}).orElse(new HoodieWriteMetadata<>());
}

public final HoodieWriteResult execute(Dataset<Row> records, boolean isTablePartitioned) {
if (writeConfig.getBoolean(DataSourceWriteOptions.INSERT_DROP_DUPS())) {
throw new HoodieException("Dropping duplicates with bulk_insert in row writer path is not supported yet");
}

boolean populateMetaFields = writeConfig.getBoolean(HoodieTableConfig.POPULATE_META_FIELDS);

BulkInsertPartitioner<Dataset<Row>> bulkInsertPartitionerRows = getPartitioner(populateMetaFields, isTablePartitioned);
boolean shouldDropPartitionColumns = writeConfig.getBoolean(DataSourceWriteOptions.DROP_PARTITION_COLUMNS());
Dataset<Row> hoodieDF = HoodieDatasetBulkInsertHelper.prepareForBulkInsert(records, writeConfig, bulkInsertPartitionerRows, shouldDropPartitionColumns, instantTime);

table = writeClient.initTable(getWriteOperationType(), Option.ofNullable(instantTime));
preExecute();
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = buildHoodieWriteMetadata(doExecute(hoodieDF, bulkInsertPartitionerRows.arePartitionRecordsSorted()));
afterExecute(result);

return new HoodieWriteResult(result.getWriteStatuses(), result.getPartitionToReplaceFileIds());
}

public abstract WriteOperationType getWriteOperationType();

public String getCommitActionType() {
return CommitUtils.getCommitActionType(getWriteOperationType(), writeClient.getConfig().getTableType());
}

protected BulkInsertPartitioner<Dataset<Row>> getPartitioner(boolean populateMetaFields, boolean isTablePartitioned) {
if (populateMetaFields) {
if (writeConfig.getIndexType() == HoodieIndex.IndexType.BUCKET) {
return new BucketIndexBulkInsertPartitionerWithRows(writeConfig.getBucketIndexHashFieldWithDefault(),
writeConfig.getBucketIndexNumBuckets());
} else {
return DataSourceUtils
.createUserDefinedBulkInsertPartitionerWithRows(writeConfig)
.orElseGet(() -> BulkInsertInternalPartitionerWithRowsFactory.get(writeConfig, isTablePartitioned));
}
} else {
// Sort modes are not yet supported when meta fields are disabled
return new NonSortPartitionerWithRows();
}
}

protected Map<String, List<String>> getPartitionToReplacedFileIds(HoodieData<WriteStatus> writeStatuses) {
return Collections.emptyMap();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hudi.commit;

import org.apache.hudi.HoodieSparkUtils;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieInternalConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.internal.DataSourceInternalWriterHelper;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Collectors;

public class DatasetBulkInsertCommitActionExecutor extends BaseDatasetBulkInsertCommitActionExecutor {

public DatasetBulkInsertCommitActionExecutor(HoodieWriteConfig config,
SparkRDDWriteClient writeClient,
String instantTime) {
super(config, writeClient, instantTime);
}

@Override
protected void preExecute() {
// no op
}

@Override
protected Option<HoodieData<WriteStatus>> doExecute(Dataset<Row> records, boolean arePartitionRecordsSorted) {
Map<String, String> opts = writeConfig.getProps().entrySet().stream().collect(Collectors.toMap(
e -> String.valueOf(e.getKey()),
e -> String.valueOf(e.getValue())));
Map<String, String> optsOverrides = Collections.singletonMap(
HoodieInternalConfig.BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED, String.valueOf(arePartitionRecordsSorted));

String targetFormat;
Map<String, String> customOpts = new HashMap<>(1);
if (HoodieSparkUtils.isSpark2()) {
targetFormat = "org.apache.hudi.internal";
} else if (HoodieSparkUtils.isSpark3()) {
targetFormat = "org.apache.hudi.spark3.internal";
customOpts.put(HoodieInternalConfig.BULKINSERT_INPUT_DATA_SCHEMA_DDL.key(), records.schema().json());
} else {
throw new HoodieException("Bulk insert using row writer is not supported with current Spark version."
+ " To use row writer please switch to spark 2 or spark 3");
}

records.write().format(targetFormat)
.option(DataSourceInternalWriterHelper.INSTANT_TIME_OPT_KEY, instantTime)
.options(opts)
.options(customOpts)
.options(optsOverrides)
.mode(SaveMode.Append)
.save();
return Option.empty();
}

@Override
protected void afterExecute(HoodieWriteMetadata<JavaRDD<WriteStatus>> result) {
// no op
}

@Override
public WriteOperationType getWriteOperationType() {
return WriteOperationType.BULK_INSERT;
}
}
Loading