[HUDI-2778] Optimize statistics collection related code, add some docs for z-order, and fix some bugs #4013
Changes from all commits
**HoodieColumnRangeMetadata.java** (`org.apache.hudi.common.model`)

```diff
@@ -18,8 +18,6 @@
 package org.apache.hudi.common.model;
 
-import org.apache.parquet.schema.PrimitiveStringifier;
-
 import java.util.Objects;
 
 /**
@@ -30,16 +28,28 @@ public class HoodieColumnRangeMetadata<T> {
   private final String columnName;
   private final T minValue;
   private final T maxValue;
-  private final long numNulls;
-  private final PrimitiveStringifier stringifier;
+  private long numNulls;
+  // For Decimal and Date types, minValue/maxValue cannot represent the original value.
+  // e.g. when parquet collects column statistics, a decimal is collected as an int/binary type,
+  // so we cannot use minValue and maxValue directly; use minValueAsString/maxValueAsString instead.
+  private final String minValueAsString;
+  private final String maxValueAsString;
 
-  public HoodieColumnRangeMetadata(final String filePath, final String columnName, final T minValue, final T maxValue, final long numNulls, final PrimitiveStringifier stringifier) {
+  public HoodieColumnRangeMetadata(
+      final String filePath,
+      final String columnName,
+      final T minValue,
+      final T maxValue,
+      long numNulls,
+      final String minValueAsString,
+      final String maxValueAsString) {
     this.filePath = filePath;
     this.columnName = columnName;
     this.minValue = minValue;
     this.maxValue = maxValue;
-    this.numNulls = numNulls;
-    this.stringifier = stringifier;
+    this.numNulls = numNulls == -1 ? 0 : numNulls;
+    this.minValueAsString = minValueAsString;
+    this.maxValueAsString = maxValueAsString;
   }
 
   public String getFilePath() {
@@ -58,8 +68,12 @@ public T getMaxValue() {
     return this.maxValue;
   }
 
-  public PrimitiveStringifier getStringifier() {
-    return stringifier;
+  public String getMaxValueAsString() {
+    return maxValueAsString;
+  }
+
+  public String getMinValueAsString() {
+    return minValueAsString;
   }
 
   public long getNumNulls() {
@@ -79,12 +93,14 @@ public boolean equals(final Object o) {
         && Objects.equals(getColumnName(), that.getColumnName())
         && Objects.equals(getMinValue(), that.getMinValue())
         && Objects.equals(getMaxValue(), that.getMaxValue())
-        && Objects.equals(getNumNulls(), that.getNumNulls());
+        && Objects.equals(getNumNulls(), that.getNumNulls())
+        && Objects.equals(getMinValueAsString(), that.getMinValueAsString())
+        && Objects.equals(getMaxValueAsString(), that.getMaxValueAsString());
   }
 
   @Override
   public int hashCode() {
-    return Objects.hash(getColumnName(), getMinValue(), getMaxValue(), getNumNulls());
+    return Objects.hash(getColumnName(), getMinValue(), getMaxValue(), getNumNulls(), getMinValueAsString(), getMaxValueAsString());
   }
 
   @Override
@@ -94,6 +110,8 @@ public String toString() {
         + "columnName='" + columnName + '\''
         + ", minValue=" + minValue
         + ", maxValue=" + maxValue
-        + ", numNulls=" + numNulls + '}';
+        + ", numNulls=" + numNulls
+        + ", minValueAsString=" + minValueAsString
+        + ", maxValueAsString=" + maxValueAsString + '}';
   }
 }
```
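To make the new shape concrete, here is a minimal sketch of constructing and reading one of these ranges for a DECIMAL(10,2) column. The file name, column name, and values are hypothetical, invented for illustration, not taken from the PR:

```java
import org.apache.hudi.common.model.HoodieColumnRangeMetadata;

public class RangeMetadataExample {
  public static void main(String[] args) {
    // Parquet stores a DECIMAL(10,2) min/max as unscaled int/binary values,
    // so the raw Comparable min/max are not human-readable on their own.
    // The *AsString fields carry the logical representation instead.
    HoodieColumnRangeMetadata<Comparable> range = new HoodieColumnRangeMetadata<>(
        "part-00000.parquet", // file name (hypothetical)
        "price",              // column name (hypothetical)
        1050,                 // raw min: unscaled decimal
        99999,                // raw max: unscaled decimal
        3,                    // numNulls (-1 would be normalized to 0)
        "10.50",              // minValueAsString: the logical decimal value
        "999.99");            // maxValueAsString

    // Consumers of Decimal/Date statistics should prefer the string form.
    System.out.println(range.getColumnName() + " range: ["
        + range.getMinValueAsString() + ", " + range.getMaxValueAsString() + "]");
  }
}
```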
**ParquetUtils.java**

```diff
@@ -39,6 +39,7 @@
 import org.apache.parquet.hadoop.metadata.BlockMetaData;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.OriginalType;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -56,6 +57,8 @@
  */
 public class ParquetUtils extends BaseFileUtils {
 
+  private static Object lock = new Object();
+
   /**
    * Read the rowKey list matching the given filter, from the given parquet file. If the filter is empty, then this will
    * return all the rowkeys.
@@ -283,17 +286,38 @@ public Boolean apply(String recordKey) {
 
-  public Collection<HoodieColumnRangeMetadata<Comparable>> readRangeFromParquetMetadata(Configuration conf, Path parquetFilePath, List<String> cols) {
+  /**
+   * Parse the min/max statistics stored in parquet footers for all columns.
+   * Note: ParquetFileReader.readFooter is not a thread-safe method.
+   *
+   * @param conf hadoop conf.
+   * @param parquetFilePath file to be read.
+   * @param cols columns for which statistics should be collected.
+   * @return a collection of HoodieColumnRangeMetadata, one per column.
+   */
+  public Collection<HoodieColumnRangeMetadata<Comparable>> readRangeFromParquetMetadata(
+      Configuration conf,
+      Path parquetFilePath,
+      List<String> cols) {
     ParquetMetadata metadata = readMetadata(conf, parquetFilePath);
     // collect stats from all parquet blocks
     Map<String, List<HoodieColumnRangeMetadata<Comparable>>> columnToStatsListMap = metadata.getBlocks().stream().flatMap(blockMetaData -> {
-      return blockMetaData.getColumns().stream().filter(f -> cols.contains(f.getPath().toDotString())).map(columnChunkMetaData ->
-          new HoodieColumnRangeMetadata<>(parquetFilePath.getName(), columnChunkMetaData.getPath().toDotString(),
-              columnChunkMetaData.getStatistics().genericGetMin(),
-              columnChunkMetaData.getStatistics().genericGetMax(),
-              columnChunkMetaData.getStatistics().getNumNulls(),
-              columnChunkMetaData.getPrimitiveType().stringifier()));
+      return blockMetaData.getColumns().stream().filter(f -> cols.contains(f.getPath().toDotString())).map(columnChunkMetaData -> {
+        String minAsString;
+        String maxAsString;
+        if (columnChunkMetaData.getPrimitiveType().getOriginalType() == OriginalType.DATE) {
+          synchronized (lock) {
+            minAsString = columnChunkMetaData.getStatistics().minAsString();
+            maxAsString = columnChunkMetaData.getStatistics().maxAsString();
+          }
+        } else {
+          minAsString = columnChunkMetaData.getStatistics().minAsString();
+          maxAsString = columnChunkMetaData.getStatistics().maxAsString();
+        }
+        return new HoodieColumnRangeMetadata<>(parquetFilePath.getName(), columnChunkMetaData.getPath().toDotString(),
+            columnChunkMetaData.getStatistics().genericGetMin(),
+            columnChunkMetaData.getStatistics().genericGetMax(),
+            columnChunkMetaData.getStatistics().getNumNulls(),
+            minAsString, maxAsString);
+      });
     }).collect(Collectors.groupingBy(e -> e.getColumnName()));
 
     // we only intend to keep file level statistics.
```
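A usage sketch, not taken from the PR: assuming `ParquetUtils` lives in `org.apache.hudi.common.util` and can be instantiated directly, and using a hypothetical file path and column list, the reshaped method returns one file-level range per requested column with the string forms already populated. The DATE-only synchronization above presumably exists because parquet's date stringification is not thread-safe; other types take the uncontended path.

```java
import java.util.Arrays;
import java.util.Collection;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieColumnRangeMetadata;
import org.apache.hudi.common.util.ParquetUtils;

public class ReadRangeExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Hypothetical parquet file and column names.
    Path parquetFile = new Path("/data/tbl/part-00000.parquet");

    Collection<HoodieColumnRangeMetadata<Comparable>> ranges =
        new ParquetUtils().readRangeFromParquetMetadata(
            conf, parquetFile, Arrays.asList("price", "event_date"));

    // One entry per requested column; per-row-group stats are already
    // combined into file-level stats by the grouping/merging step below.
    for (HoodieColumnRangeMetadata<Comparable> r : ranges) {
      System.out.println(r.getColumnName() + " -> ["
          + r.getMinValueAsString() + ", " + r.getMaxValueAsString() + "]");
    }
  }
}
```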
```diff
@@ -316,23 +340,41 @@ private HoodieColumnRangeMetadata<Comparable> combineRanges(HoodieColumnRangeMet
                                                               HoodieColumnRangeMetadata<Comparable> range2) {
     final Comparable minValue;
     final Comparable maxValue;
+    final String minValueAsString;
+    final String maxValueAsString;
     if (range1.getMinValue() != null && range2.getMinValue() != null) {
-      minValue = range1.getMinValue().compareTo(range2.getMinValue()) < 0 ? range1.getMinValue() : range2.getMinValue();
+      if (range1.getMinValue().compareTo(range2.getMinValue()) < 0) {
+        minValue = range1.getMinValue();
+        minValueAsString = range1.getMinValueAsString();
+      } else {
+        minValue = range2.getMinValue();
+        minValueAsString = range2.getMinValueAsString();
+      }
     } else if (range1.getMinValue() == null) {
       minValue = range2.getMinValue();
+      minValueAsString = range2.getMinValueAsString();
     } else {
       minValue = range1.getMinValue();
+      minValueAsString = range1.getMinValueAsString();
     }
 
     if (range1.getMaxValue() != null && range2.getMaxValue() != null) {
-      maxValue = range1.getMaxValue().compareTo(range2.getMaxValue()) < 0 ? range2.getMaxValue() : range1.getMaxValue();
+      if (range1.getMaxValue().compareTo(range2.getMaxValue()) < 0) {
+        maxValue = range2.getMaxValue();
+        maxValueAsString = range2.getMaxValueAsString();
+      } else {
+        maxValue = range1.getMaxValue();
+        maxValueAsString = range1.getMaxValueAsString();
+      }
     } else if (range1.getMaxValue() == null) {
       maxValue = range2.getMaxValue();
+      maxValueAsString = range2.getMaxValueAsString();
     } else {
       maxValue = range1.getMaxValue();
+      maxValueAsString = range1.getMaxValueAsString();
     }
 
     return new HoodieColumnRangeMetadata<>(range1.getFilePath(),
-        range1.getColumnName(), minValue, maxValue, range1.getNumNulls() + range2.getNumNulls(), range1.getStringifier());
+        range1.getColumnName(), minValue, maxValue, range1.getNumNulls() + range2.getNumNulls(), minValueAsString, maxValueAsString);
   }
 }
```
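`combineRanges` is private, so the sketch below only illustrates its contract with hypothetical values: the smaller min and the larger max win, each string form travels with the raw value it belongs to, and null counts are summed.

```java
import org.apache.hudi.common.model.HoodieColumnRangeMetadata;

public class CombineRangesExample {
  public static void main(String[] args) {
    // Two per-row-group ranges for the same "price" column (hypothetical
    // values; raw min/max are unscaled decimals, strings are logical values).
    HoodieColumnRangeMetadata<Comparable> rowGroup0 = new HoodieColumnRangeMetadata<>(
        "f.parquet", "price", 100, 500, 3, "1.00", "5.00");
    HoodieColumnRangeMetadata<Comparable> rowGroup1 = new HoodieColumnRangeMetadata<>(
        "f.parquet", "price", 50, 999, 2, "0.50", "9.99");

    // Under the merge rule above, the expected file-level result is:
    //   minValue = 50  with minValueAsString = "0.50"
    //   maxValue = 999 with maxValueAsString = "9.99"
    //   numNulls = 3 + 2 = 5
  }
}
```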
**DataSkippingUtils.scala**

```diff
@@ -100,7 +100,7 @@ object DataSkippingUtils {
       // query filter "colA >= b" convert it to "colA_maxValue >= b" for index table
       case GreaterThanOrEqual(attribute: AttributeReference, right: Literal) =>
         val colName = getTargetColNameParts(attribute)
-        GreaterThanOrEqual(maxValue(colName), right)
+        reWriteCondition(colName, GreaterThanOrEqual(maxValue(colName), right))
       // query filter "b >= colA" convert it to "colA_minValue <= b" for index table
       case GreaterThanOrEqual(value: Literal, attribute: AttributeReference) =>
         val colName = getTargetColNameParts(attribute)
@@ -179,7 +179,7 @@ object DataSkippingUtils {
   def getIndexFiles(conf: Configuration, indexPath: String): Seq[FileStatus] = {
     val basePath = new Path(indexPath)
     basePath.getFileSystem(conf)
-      .listStatus(basePath).filterNot(f => f.getPath.getName.endsWith(".parquet"))
+      .listStatus(basePath).filter(f => f.getPath.getName.endsWith(".parquet"))
   }
```
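The `filterNot` to `filter` change inverts the selection: `getIndexFiles` is meant to return the index's parquet files, and the old code returned everything except them. A minimal Java rendering of the corrected behavior (the Hudi original is Scala; the index path is hypothetical):

```java
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;

public class IndexFileListing {
  public static List<FileStatus> getIndexFiles(Configuration conf, String indexPath) throws IOException {
    Path basePath = new Path(indexPath);
    // Keep only the .parquet index files; filterNot excluded them by mistake.
    return Arrays.stream(basePath.getFileSystem(conf).listStatus(basePath))
        .filter(f -> f.getPath().getName().endsWith(".parquet"))
        .collect(Collectors.toList());
  }
}
```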
**Review comment:** Wondering how much of this should go into the config docs itself?