# [HUDI-6367] Fix NPE in HoodieAvroParquetReader and support complex schema with timestamp #8955
**HoodieAvroParquetReader.java**

```diff
@@ -18,6 +18,10 @@
 package org.apache.hudi.hadoop.avro;

+import org.apache.hudi.avro.HoodieAvroUtils;
+import org.apache.hudi.hadoop.HoodieColumnProjectionUtils;
+import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils;
+
 import org.apache.avro.Schema;
 import org.apache.avro.generic.GenericData;
 import org.apache.avro.generic.GenericRecord;
@@ -26,8 +30,6 @@
 import org.apache.hadoop.mapreduce.InputSplit;
 import org.apache.hadoop.mapreduce.RecordReader;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hudi.hadoop.HoodieColumnProjectionUtils;
-import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils;
 import org.apache.parquet.avro.AvroReadSupport;
 import org.apache.parquet.avro.AvroSchemaConverter;
 import org.apache.parquet.format.converter.ParquetMetadataConverter;
@@ -40,7 +42,6 @@
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;
-import java.util.stream.Collectors;

 import static org.apache.parquet.hadoop.ParquetInputFormat.getFilter;
@@ -50,24 +51,20 @@ public class HoodieAvroParquetReader extends RecordReader<Void, ArrayWritable> {
   private Schema baseSchema;
```
Review thread on `private Schema baseSchema;` (later marked as resolved by Zouxxyy):

**Contributor:** In my origin PR HUDI-83 I didn't declare the …

**Author:** I have tested that …

**Contributor:** @Zouxxyy …

**Author:** @cdmikechen Have you ever tested:

```sql
-- spark-sql
create table test_ts_1(
  id int,
  ts1 timestamp)
using hudi
tblproperties(
  type='mor',
  primaryKey='id'
);

INSERT INTO test_ts_1
SELECT 1, cast ('2021-12-25 12:01:01' as timestamp);

create table test_ts_2(
  id int,
  ts1 array<timestamp>,
  ts2 map<string, timestamp>,
  ts3 struct<province:timestamp, city:string>)
using hudi
tblproperties(
  type='mor',
  primaryKey='id'
);

INSERT INTO test_ts_2
SELECT 1,
       array(cast ('2021-12-25 12:01:01' as timestamp)),
       map('key', cast ('2021-12-25 12:01:01' as timestamp)),
       struct(cast ('2021-12-25 12:01:01' as timestamp), 'test');

-- hive
select * from test_ts_1;
select id from test_ts_1;
select ts1 from test_ts_1;
select id, ts1 from test_ts_1;
select count(*) from test_ts_1;

select * from test_ts_2;
select id from test_ts_2;
select ts1 from test_ts_2;
select id, ts1 from test_ts_2;
select count(*) from test_ts_2;
```

CC @danny0405 @xicm
```diff
   public HoodieAvroParquetReader(InputSplit inputSplit, Configuration conf) throws IOException {
+    // get base schema
+    ParquetMetadata fileFooter =
+        ParquetFileReader.readFooter(conf, ((ParquetInputSplit) inputSplit).getPath(), ParquetMetadataConverter.NO_FILTER);
+    MessageType messageType = fileFooter.getFileMetaData().getSchema();
+    baseSchema = new AvroSchemaConverter(conf).convert(messageType);
+
     // if exists read columns, we need to filter columns.
     List<String> readColNames = Arrays.asList(HoodieColumnProjectionUtils.getReadColumnNames(conf));
     if (!readColNames.isEmpty()) {
-      // get base schema
-      ParquetMetadata fileFooter =
-          ParquetFileReader.readFooter(conf, ((ParquetInputSplit) inputSplit).getPath(), ParquetMetadataConverter.NO_FILTER);
-      MessageType messageType = fileFooter.getFileMetaData().getSchema();
-      baseSchema = new AvroSchemaConverter(conf).convert(messageType);
       // filter schema for reading
-      final Schema filterSchema = Schema.createRecord(baseSchema.getName(),
-          baseSchema.getDoc(), baseSchema.getNamespace(), baseSchema.isError(),
-          baseSchema.getFields().stream()
-              .filter(f -> readColNames.contains(f.name()))
-              .map(f -> new Schema.Field(f.name(), f.schema(), f.doc(), f.defaultVal()))
-              .collect(Collectors.toList()));
+      Schema filterSchema = HoodieAvroUtils.generateProjectionSchema(baseSchema, readColNames);
       AvroReadSupport.setAvroReadSchema(conf, filterSchema);
       AvroReadSupport.setRequestedProjection(conf, filterSchema);
     }

     parquetRecordReader = new ParquetRecordReader<>(new AvroReadSupport<>(), getFilter(conf));
   }
```
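Two things change here. First, the footer read is hoisted out of the `if` block, so `baseSchema` is assigned even when no columns are projected. Second, the hand-rolled field filtering is replaced with Hudi's existing `HoodieAvroUtils.generateProjectionSchema` utility. A minimal sketch of that utility in isolation (the record and field names below are hypothetical, not taken from the PR):

```java
import java.util.Arrays;

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.hudi.avro.HoodieAvroUtils;

public class ProjectionSketch {
  public static void main(String[] args) {
    // Hypothetical base schema, standing in for the one converted from the Parquet footer.
    Schema baseSchema = SchemaBuilder.record("test_record").fields()
        .requiredInt("id")
        .requiredLong("ts1")
        .requiredString("name")
        .endRecord();

    // Keep only the columns Hive asked to read, in one call instead of a manual stream.
    Schema filterSchema =
        HoodieAvroUtils.generateProjectionSchema(baseSchema, Arrays.asList("id", "ts1"));

    System.out.println(filterSchema.getFields()); // [id, ts1]
  }
}
```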
**TestHoodieColumnProjectionUtils.java** (new file, +76 lines)

```java
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.hadoop;

import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

public class TestHoodieColumnProjectionUtils {

  @Test
  void testTypeContainsTimestamp() {
    String col1 = "timestamp";
    TypeInfo typeInfo1 = TypeInfoUtils.getTypeInfosFromTypeString(col1).get(0);
    assertTrue(HoodieColumnProjectionUtils.typeContainsTimestamp(typeInfo1));

    String col2 = "string";
    TypeInfo typeInfo2 = TypeInfoUtils.getTypeInfosFromTypeString(col2).get(0);
    assertFalse(HoodieColumnProjectionUtils.typeContainsTimestamp(typeInfo2));

    String col3 = "array<timestamp>";
    TypeInfo typeInfo3 = TypeInfoUtils.getTypeInfosFromTypeString(col3).get(0);
    assertTrue(HoodieColumnProjectionUtils.typeContainsTimestamp(typeInfo3));

    String col4 = "array<string>";
    TypeInfo typeInfo4 = TypeInfoUtils.getTypeInfosFromTypeString(col4).get(0);
    assertFalse(HoodieColumnProjectionUtils.typeContainsTimestamp(typeInfo4));

    String col5 = "map<string,timestamp>";
    TypeInfo typeInfo5 = TypeInfoUtils.getTypeInfosFromTypeString(col5).get(0);
    assertTrue(HoodieColumnProjectionUtils.typeContainsTimestamp(typeInfo5));

    String col6 = "map<string,string>";
    TypeInfo typeInfo6 = TypeInfoUtils.getTypeInfosFromTypeString(col6).get(0);
    assertFalse(HoodieColumnProjectionUtils.typeContainsTimestamp(typeInfo6));

    String col7 = "struct<name1:string,name2:timestamp>";
    TypeInfo typeInfo7 = TypeInfoUtils.getTypeInfosFromTypeString(col7).get(0);
    assertTrue(HoodieColumnProjectionUtils.typeContainsTimestamp(typeInfo7));

    String col8 = "struct<name1:string,name2:string>";
    TypeInfo typeInfo8 = TypeInfoUtils.getTypeInfosFromTypeString(col8).get(0);
    assertFalse(HoodieColumnProjectionUtils.typeContainsTimestamp(typeInfo8));

    String col9 = "uniontype<string,timestamp>";
    TypeInfo typeInfo9 = TypeInfoUtils.getTypeInfosFromTypeString(col9).get(0);
    assertTrue(HoodieColumnProjectionUtils.typeContainsTimestamp(typeInfo9));

    String col10 = "uniontype<string,int>";
    TypeInfo typeInfo10 = TypeInfoUtils.getTypeInfosFromTypeString(col10).get(0);
    assertFalse(HoodieColumnProjectionUtils.typeContainsTimestamp(typeInfo10));

    String col11 = "uniontype<string,int,map<string,array<timestamp>>>";
    TypeInfo typeInfo11 = TypeInfoUtils.getTypeInfosFromTypeString(col11).get(0);
    assertTrue(HoodieColumnProjectionUtils.typeContainsTimestamp(typeInfo11));
  }
}
```
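The test exercises `HoodieColumnProjectionUtils.typeContainsTimestamp` across primitive, array, map, struct, and nested union types. The utility itself is not shown in this diff; below is a minimal sketch of what such a recursive check over Hive's `TypeInfo` tree could look like. Only the method name and expected behavior come from the test; the body is an assumption:

```java
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;

public final class TimestampTypeCheckSketch {

  // Sketch: recursively walk a Hive TypeInfo and report whether any leaf type is TIMESTAMP.
  public static boolean typeContainsTimestamp(TypeInfo typeInfo) {
    switch (typeInfo.getCategory()) {
      case PRIMITIVE:
        return ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory() == PrimitiveCategory.TIMESTAMP;
      case LIST:
        // array<T>: recurse into the element type.
        return typeContainsTimestamp(((ListTypeInfo) typeInfo).getListElementTypeInfo());
      case MAP:
        // map<K,V>: recurse into both key and value types.
        MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo;
        return typeContainsTimestamp(mapTypeInfo.getMapKeyTypeInfo())
            || typeContainsTimestamp(mapTypeInfo.getMapValueTypeInfo());
      case STRUCT:
        return ((StructTypeInfo) typeInfo).getAllStructFieldTypeInfos().stream()
            .anyMatch(TimestampTypeCheckSketch::typeContainsTimestamp);
      case UNION:
        return ((UnionTypeInfo) typeInfo).getAllUnionObjectTypeInfos().stream()
            .anyMatch(TimestampTypeCheckSketch::typeContainsTimestamp);
      default:
        return false;
    }
  }
}
```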
A second review thread:

> @xicm Here I think it should return false directly, what do you think?

> Agree with you. @cdmikechen, do you have any other concern?

> When can the `readCols` be empty?

> As far as I know, a query such as `count(*)`, which doesn't need to read any cols.

> In such a case, can the timestamp still be read correctly?

> Yes.
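To make the failure mode under discussion concrete: before this fix, an empty projection (as produced by `count(*)`) meant the footer-reading block was skipped, `baseSchema` stayed `null`, and the first later dereference threw the NPE the PR title refers to. A stripped-down repro of that control flow, detached from Hudi (all names here are illustrative):

```java
import java.util.Collections;
import java.util.List;

import org.apache.avro.Schema;

public class EmptyProjectionNpeDemo {
  public static void main(String[] args) {
    // For `select count(*)`, Hive requests no columns at all.
    List<String> readColNames = Collections.emptyList();

    Schema baseSchema = null;
    if (!readColNames.isEmpty()) {
      // Before the fix, the footer was read (and baseSchema assigned) only in here.
      baseSchema = fakeFooterSchema();
    }

    // Any later use of baseSchema during record conversion then blows up.
    System.out.println(baseSchema.getFields()); // NullPointerException
  }

  private static Schema fakeFooterSchema() {
    // Stand-in for reading the Parquet footer and converting it to Avro.
    return Schema.createRecord("record", null, "ns", false, Collections.emptyList());
  }
}
```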