-
Notifications
You must be signed in to change notification settings - Fork 2.5k
[HUDI-3213] Making commit preserve metadata to true for compaction #4811
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
75583d8
f535954
16eddfc
6a244bc
8b55003
0f991a1
8daae6a
0d32395
8ccef8c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -61,6 +61,8 @@ | |
| import java.util.Map; | ||
| import java.util.Set; | ||
|
|
||
| import static org.apache.hudi.common.model.HoodieRecord.FILENAME_METADATA_FIELD_POS; | ||
|
|
||
| @SuppressWarnings("Duplicates") | ||
| /** | ||
| * Handle to merge incoming records to those in storage. | ||
|
|
@@ -262,7 +264,7 @@ private boolean writeUpdateRecord(HoodieRecord<T> hoodieRecord, GenericRecord ol | |
| isDelete = HoodieOperation.isDelete(hoodieRecord.getOperation()); | ||
| } | ||
| } | ||
| return writeRecord(hoodieRecord, indexedRecord, isDelete); | ||
| return writeRecord(hoodieRecord, indexedRecord, isDelete, oldRecord); | ||
| } | ||
|
|
||
| protected void writeInsertRecord(HoodieRecord<T> hoodieRecord) throws IOException { | ||
|
|
@@ -272,16 +274,16 @@ protected void writeInsertRecord(HoodieRecord<T> hoodieRecord) throws IOExceptio | |
| if (insertRecord.isPresent() && insertRecord.get().equals(IGNORE_RECORD)) { | ||
| return; | ||
| } | ||
| if (writeRecord(hoodieRecord, insertRecord, HoodieOperation.isDelete(hoodieRecord.getOperation()))) { | ||
| if (writeRecord(hoodieRecord, insertRecord, HoodieOperation.isDelete(hoodieRecord.getOperation()), null)) { | ||
| insertRecordsWritten++; | ||
| } | ||
| } | ||
|
|
||
| protected boolean writeRecord(HoodieRecord<T> hoodieRecord, Option<IndexedRecord> indexedRecord) { | ||
| return writeRecord(hoodieRecord, indexedRecord, false); | ||
| return writeRecord(hoodieRecord, indexedRecord, false, null); | ||
| } | ||
|
|
||
| protected boolean writeRecord(HoodieRecord<T> hoodieRecord, Option<IndexedRecord> indexedRecord, boolean isDelete) { | ||
| protected boolean writeRecord(HoodieRecord<T> hoodieRecord, Option<IndexedRecord> indexedRecord, boolean isDelete, GenericRecord oldRecord) { | ||
| Option recordMetadata = hoodieRecord.getData().getMetadata(); | ||
| if (!partitionPath.equals(hoodieRecord.getPartitionPath())) { | ||
| HoodieUpsertException failureEx = new HoodieUpsertException("mismatched partition path, record partition: " | ||
|
|
@@ -292,8 +294,10 @@ protected boolean writeRecord(HoodieRecord<T> hoodieRecord, Option<IndexedRecord | |
| try { | ||
| if (indexedRecord.isPresent() && !isDelete) { | ||
| // Convert GenericRecord to GenericRecord with hoodie commit metadata in schema | ||
| IndexedRecord recordWithMetadataInSchema = rewriteRecord((GenericRecord) indexedRecord.get()); | ||
| if (preserveMetadata) { | ||
| IndexedRecord recordWithMetadataInSchema = rewriteRecord((GenericRecord) indexedRecord.get(), preserveMetadata, oldRecord); | ||
| if (preserveMetadata && useWriterSchema) { // useWriterSchema will be true only in case of compaction. | |
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If this is the case, a better name could be |
||
| // do not preserve FILENAME_METADATA_FIELD | ||
| recordWithMetadataInSchema.put(FILENAME_METADATA_FIELD_POS, newFilePath.getName()); | ||
| fileWriter.writeAvro(hoodieRecord.getRecordKey(), recordWithMetadataInSchema); | ||
| } else { | ||
| fileWriter.writeAvroWithMetadata(recordWithMetadataInSchema, hoodieRecord); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -176,7 +176,7 @@ public static Schema addMetadataFields(Schema schema) { | |
| /** | ||
| * Adds the Hoodie metadata fields to the given schema. | ||
| * | ||
| * @param schema The schema | ||
| * @param schema The schema | ||
| * @param withOperationField Whether to include the '_hoodie_operation' field | ||
| */ | ||
| public static Schema addMetadataFields(Schema schema, boolean withOperationField) { | ||
|
|
@@ -276,7 +276,7 @@ public static Schema getSchemaForFields(Schema fileSchema, List<String> fields) | |
| List<Schema.Field> toBeAddedFields = new ArrayList<>(); | ||
| Schema recordSchema = Schema.createRecord("HoodieRecordKey", "", "", false); | ||
|
|
||
| for (Schema.Field schemaField: fileSchema.getFields()) { | ||
| for (Schema.Field schemaField : fileSchema.getFields()) { | ||
| if (fields.contains(schemaField.name())) { | ||
| toBeAddedFields.add(new Schema.Field(schemaField.name(), schemaField.schema(), schemaField.doc(), schemaField.defaultVal())); | ||
| } | ||
|
|
@@ -303,7 +303,7 @@ public static GenericRecord addOperationToRecord(GenericRecord record, HoodieOpe | |
| * engines have varying constraints regarding treating the case-sensitivity of fields, it's best to let the caller | ||
| * determine that. | ||
| * | ||
| * @param schema Passed in schema | ||
| * @param schema Passed in schema | ||
| * @param newFieldNames Null Field names to be added | ||
| */ | ||
| public static Schema appendNullSchemaFields(Schema schema, List<String> newFieldNames) { | ||
|
|
@@ -382,10 +382,34 @@ public static GenericRecord rewriteRecord(GenericRecord oldRecord, Schema newSch | |
| return newRecord; | ||
| } | ||
|
|
||
| public static GenericRecord rewriteRecord(GenericRecord genericRecord, Schema newSchema, boolean copyOverMetaFields, GenericRecord fallbackRecord) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be good to have at least one unit test covering this |
||
| GenericRecord newRecord = new GenericData.Record(newSchema); | ||
| boolean isSpecificRecord = genericRecord instanceof SpecificRecordBase; | ||
| for (Schema.Field f : newSchema.getFields()) { | ||
| if (!(isSpecificRecord && isMetadataField(f.name()))) { | ||
| copyOldValueOrSetDefault(genericRecord, newRecord, f); | ||
| } | ||
| if (isMetadataField(f.name()) && copyOverMetaFields) { | ||
| // if meta field exists in primary generic record, copy over. | ||
| if (genericRecord.getSchema().getField(f.name()) != null) { | ||
| copyOldValueOrSetDefault(genericRecord, newRecord, f); | ||
| } else if (fallbackRecord != null && fallbackRecord.getSchema().getField(f.name()) != null) { | ||
| // if not, try to copy from the fallback record. | ||
| copyOldValueOrSetDefault(fallbackRecord, newRecord, f); | ||
| } | ||
| } | ||
| } | ||
| if (!GenericData.get().validate(newSchema, newRecord)) { | ||
| throw new SchemaCompatibilityException( | ||
| "Unable to validate the rewritten record " + genericRecord + " against schema " + newSchema); | ||
| } | ||
| return newRecord; | ||
| } | ||
|
|
||
| /** | ||
| * Converts list of {@link GenericRecord} provided into the {@link GenericRecord} adhering to the | ||
| * provided {@code newSchema}. | ||
| * | ||
| * <p> | ||
| * To better understand conversion rules please check {@link #rewriteRecord(GenericRecord, Schema)} | ||
| */ | ||
| public static List<GenericRecord> rewriteRecords(List<GenericRecord> records, Schema newSchema) { | ||
|
|
@@ -491,9 +515,8 @@ public static Object getNestedFieldVal(GenericRecord record, String fieldName, b | |
| * Returns the string value of the given record {@code rec} and field {@code fieldName}. | ||
| * The field and value both could be missing. | ||
| * | ||
| * @param rec The record | ||
| * @param rec The record | ||
| * @param fieldName The field name | ||
| * | ||
| * @return the string form of the field | ||
| * or empty if the schema does not contain the field name or the value is null | ||
| */ | ||
|
|
@@ -507,7 +530,7 @@ public static Option<String> getNullableValAsString(GenericRecord rec, String fi | |
| * This method converts values for fields with certain Avro/Parquet data types that require special handling. | ||
| * | ||
| * @param fieldSchema avro field schema | ||
| * @param fieldValue avro field value | ||
| * @param fieldValue avro field value | ||
| * @return field value either converted (for certain data types) or as it is. | ||
| */ | ||
| public static Object convertValueForSpecificDataTypes(Schema fieldSchema, Object fieldValue, boolean consistentLogicalTimestampEnabled) { | ||
|
|
@@ -527,15 +550,15 @@ public static Object convertValueForSpecificDataTypes(Schema fieldSchema, Object | |
|
|
||
| /** | ||
| * This method converts values for fields with certain Avro Logical data types that require special handling. | ||
| * | ||
| * <p> | ||
| * Logical Date Type is converted to actual Date value instead of Epoch Integer which is how it is | ||
| * represented/stored in parquet. | ||
| * | ||
| * <p> | ||
| * Decimal Data Type is converted to actual decimal value instead of bytes/fixed which is how it is | ||
| * represented/stored in parquet. | ||
| * | ||
| * @param fieldSchema avro field schema | ||
| * @param fieldValue avro field value | ||
| * @param fieldValue avro field value | ||
| * @return field value either converted (for certain data types) or as it is. | ||
| */ | ||
| private static Object convertValueForAvroLogicalTypes(Schema fieldSchema, Object fieldValue, boolean consistentLogicalTimestampEnabled) { | ||
|
|
@@ -569,6 +592,7 @@ public static Schema getNullSchema() { | |
| /** | ||
| * Sanitizes Name according to Avro rule for names. | ||
| * Removes characters other than the ones mentioned in https://avro.apache.org/docs/current/spec.html#names . | ||
| * | ||
| * @param name input name | ||
| * @return sanitized name | ||
| */ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -42,6 +42,8 @@ public abstract class HoodieRecord<T> implements Serializable { | |
| public static final String OPERATION_METADATA_FIELD = "_hoodie_operation"; | ||
| public static final String HOODIE_IS_DELETED = "_hoodie_is_deleted"; | ||
|
|
||
| public static int FILENAME_METADATA_FIELD_POS = 4; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This could be a separate cleanup task: make constants for all meta fields and adopt them across the codebase |
||
|
|
||
| public static final List<String> HOODIE_META_COLUMNS = | ||
| CollectionUtils.createImmutableList(COMMIT_TIME_METADATA_FIELD, COMMIT_SEQNO_METADATA_FIELD, | ||
| RECORD_KEY_METADATA_FIELD, PARTITION_PATH_METADATA_FIELD, FILENAME_METADATA_FIELD); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.