From 2c2fc1284f78bea1319145ffe1f76a77c91ffb92 Mon Sep 17 00:00:00 2001 From: xiarixiaoyao Date: Fri, 23 Jul 2021 09:32:29 +0800 Subject: [PATCH 1/3] [HUDI-2101]support z-order for hudi --- .../hudi/config/HoodieClusteringConfig.java | 103 +++ .../apache/hudi/config/HoodieWriteConfig.java | 25 + .../apache/hudi/optimize/UnsafeAccess.java | 74 ++ .../apache/hudi/optimize/ZOrderingUtil.java | 228 +++++ .../org/apache/hudi/table/HoodieTable.java | 10 + .../hudi/optimize/TestZOrderingUtil.java | 129 +++ .../table/HoodieFlinkCopyOnWriteTable.java | 6 + .../table/HoodieJavaCopyOnWriteTable.java | 6 + .../hudi/client/SparkRDDWriteClient.java | 6 + .../SparkSortAndSizeExecutionStrategy.java | 8 +- ...atialCurveOptimizationSortPartitioner.java | 98 +++ .../table/HoodieSparkCopyOnWriteTable.java | 31 + .../action/commit/SparkBulkInsertHelper.java | 2 +- .../org/apache/spark/sql/Zoptimize.scala | 830 ++++++++++++++++++ .../sql/hudi/execution/RangeSample.scala | 239 +++++ .../common/table/HoodieTableMetaClient.java | 8 + .../org/apache/hudi/HoodieFileIndex.scala | 79 +- .../org/apache/hudi/TestOptimizeTable.scala | 86 ++ 18 files changed, 1960 insertions(+), 8 deletions(-) create mode 100644 hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/UnsafeAccess.java create mode 100644 hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/ZOrderingUtil.java create mode 100644 hudi-client/hudi-client-common/src/test/java/org/apache/hudi/optimize/TestZOrderingUtil.java create mode 100644 hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveOptimizationSortPartitioner.java create mode 100644 hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/Zoptimize.scala create mode 100644 hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/execution/RangeSample.scala create mode 100644 hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestOptimizeTable.scala diff --git 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java index c93907c4a33bf..684ff1459a2ad 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java @@ -22,10 +22,12 @@ import org.apache.hudi.common.config.ConfigGroups; import org.apache.hudi.common.config.ConfigProperty; import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.exception.HoodieException; import java.io.File; import java.io.FileReader; import java.io.IOException; +import java.util.Locale; import java.util.Properties; /** @@ -40,6 +42,9 @@ public class HoodieClusteringConfig extends HoodieConfig { // Any strategy specific params can be saved with this prefix public static final String CLUSTERING_STRATEGY_PARAM_PREFIX = "hoodie.clustering.plan.strategy."; + // Any Space-filling curves optimize(z-order/hilbert) params can be saved with this prefix + public static final String DATA_OPTIMIZE_PARAM_PREFIX = "hoodie.layout.optimize."; + public static final ConfigProperty DAYBASED_LOOKBACK_PARTITIONS = ConfigProperty .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "daybased.lookback.partitions") .defaultValue("2") @@ -137,6 +142,55 @@ public class HoodieClusteringConfig extends HoodieConfig { .sinceVersion("0.9.0") .withDocumentation("When rewriting data, preserves existing hoodie_commit_time"); + public static final ConfigProperty SPACE_FILLING_CURVE_DATA_OPTIMIZE_ENABLE = ConfigProperty + .key(DATA_OPTIMIZE_PARAM_PREFIX + "space.filling.curve.data.optimize.enable") + .defaultValue(false) + .sinceVersion("0.10.0") + .withDocumentation("config to use z-ordering/space-filling curves to optimize the layout of table to boost query performance"); + + public static final ConfigProperty DATA_OPTIMIZE_STRATEGY = 
ConfigProperty + .key(DATA_OPTIMIZE_PARAM_PREFIX + "strategy") + .defaultValue("z-order") + .sinceVersion("0.10.0") + .withDocumentation("config to provide a way to optimize data layout for table, current only support z-order and hilbert"); + + /** + * There exists two method to build z-curve. + * one is directly mapping sort cols to z-value to build z-curve; + * we can find this method in Amazon DynamoDB https://aws.amazon.com/cn/blogs/database/tag/z-order/ + * the other one is Boundary-based Interleaved Index method which we proposed. simply call it sample method. + * Refer to rfc-28 for specific algorithm flow. + * Boundary-based Interleaved Index method has better generalization, but the build speed is slower than direct method. + */ + public static final ConfigProperty DATA_OPTIMIZE_BUILD_CURVE_STRATEGY = ConfigProperty + .key(DATA_OPTIMIZE_PARAM_PREFIX + "build.curve.strategy") + .defaultValue("direct") + .sinceVersion("0.10.0") + .withDocumentation("Config to provide whether use direct/sample method to build curve optimize for data layout," + + "build curve_optimize by directly method is faster than by sample method, however sample method produce a better data layout." + + "now support two strategies: directly,sample"); + /** + * Doing sample for table data is the first step in Boundary-based Interleaved Index method. + * larger sample number means better optimize result, but more memory consumption + */ + public static final ConfigProperty DATA_OPTIMIZE_BUILD_CURVE_SAMPLE_NUMBER = ConfigProperty + .key(DATA_OPTIMIZE_PARAM_PREFIX + "build.curve.sample.number") + .defaultValue("200000") + .sinceVersion("0.10.0") + .withDocumentation("when set" + DATA_OPTIMIZE_BUILD_CURVE_STRATEGY.key() + " to sample method, sample number need to be set for it." 
+ + " larger number means better layout result, but more memory consumer"); + + /** + * The best way to use Z-order/Space-filling curves is to cooperate with Data-Skipping + * with data-skipping query engine can greatly reduce the number of table files to be read. + * otherwise query engine can only do row-group skipping for files (parquet/orc) + */ + public static final ConfigProperty DATA_OPTIMIZE_DATA_SKIPPING_ENABLE = ConfigProperty + .key(DATA_OPTIMIZE_PARAM_PREFIX + "data.skipping.enable") + .defaultValue(true) + .sinceVersion("0.10.0") + .withDocumentation("enable dataSkipping for hudi, when optimize finished, statistics will be collected which used for dataSkipping"); + /** * @deprecated Use {@link #PLAN_STRATEGY_CLASS_NAME} and its methods instead */ @@ -350,9 +404,58 @@ public Builder withPreserveHoodieCommitMetadata(Boolean preserveHoodieCommitMeta return this; } + public Builder withSpaceFillingCurveDataOptimizeEnable(Boolean enable) { + clusteringConfig.setValue(SPACE_FILLING_CURVE_DATA_OPTIMIZE_ENABLE, String.valueOf(enable)); + return this; + } + + public Builder withDataOptimizeStrategy(String strategy) { + clusteringConfig.setValue(DATA_OPTIMIZE_STRATEGY, strategy); + return this; + } + + public Builder withDataOptimizeBuildCurveStrategy(String method) { + clusteringConfig.setValue(DATA_OPTIMIZE_BUILD_CURVE_STRATEGY, method); + return this; + } + + public Builder withDataOptimizeBuildCurveSampleNumber(int sampleNumber) { + clusteringConfig.setValue(DATA_OPTIMIZE_BUILD_CURVE_SAMPLE_NUMBER, String.valueOf(sampleNumber)); + return this; + } + + public Builder withDataOptimizeDataSkippingEnable(boolean dataSkipping) { + clusteringConfig.setValue(DATA_OPTIMIZE_DATA_SKIPPING_ENABLE, String.valueOf(dataSkipping)); + return this; + } + public HoodieClusteringConfig build() { clusteringConfig.setDefaults(HoodieClusteringConfig.class.getName()); return clusteringConfig; } } + + /** + * strategy types for build z-ordering/space-filling curves. 
+ */ + public enum BuildCurveStrategyType { + DIRECT("direct"), + SAMPLE("sample"); + private final String value; + + BuildCurveStrategyType(String value) { + this.value = value; + } + + public static BuildCurveStrategyType fromValue(String value) { + switch (value.toLowerCase(Locale.ROOT)) { + case "direct": + return DIRECT; + case "sample": + return SAMPLE; + default: + throw new HoodieException("Invalid value of Type."); + } + } + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java index c9d8c4f117eaf..bcf6d09390916 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java @@ -1228,6 +1228,30 @@ public String getClusteringSortColumns() { return getString(HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS); } + /** + * Data layout optimize properties. + */ + public boolean getSpaceFillingCurveDataOptimizeEnable() { + return getBoolean(HoodieClusteringConfig.SPACE_FILLING_CURVE_DATA_OPTIMIZE_ENABLE); + } + + public String getDataOptimizeStrategy() { + return getString(HoodieClusteringConfig.DATA_OPTIMIZE_STRATEGY); + } + + public HoodieClusteringConfig.BuildCurveStrategyType getOptimizeBuildCurveMethod() { + return HoodieClusteringConfig.BuildCurveStrategyType.fromValue( + getString(HoodieClusteringConfig.DATA_OPTIMIZE_BUILD_CURVE_STRATEGY)); + } + + public int getOptimizeSampleNumber() { + return getInt(HoodieClusteringConfig.DATA_OPTIMIZE_BUILD_CURVE_SAMPLE_NUMBER); + } + + public boolean getOptimizeEnableDataSkipping() { + return getBoolean(HoodieClusteringConfig.DATA_OPTIMIZE_DATA_SKIPPING_ENABLE); + } + /** * index properties. 
*/ @@ -1776,6 +1800,7 @@ public static class Builder { private boolean isStorageConfigSet = false; private boolean isCompactionConfigSet = false; private boolean isClusteringConfigSet = false; + private boolean isOptimizeConfigSet = false; private boolean isMetricsConfigSet = false; private boolean isBootstrapConfigSet = false; private boolean isMemoryConfigSet = false; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/UnsafeAccess.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/UnsafeAccess.java new file mode 100644 index 0000000000000..f8420e8374f8d --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/UnsafeAccess.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.optimize; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import sun.misc.Unsafe; + +import java.lang.reflect.Field; +import java.nio.ByteOrder; +import java.security.AccessController; +import java.security.PrivilegedAction; + +/** + * This class is copied from hbase used by Lexicographically comparison algorithm. 
+ * we use Lexicographically comparison algorithm to do sort for z-values which needed by z-order. + * and the unsafe comparision algorithm is a faster implementation than java implementation + */ +public class UnsafeAccess { + private static final Logger LOG = LoggerFactory.getLogger(UnsafeAccess.class); + public static final Unsafe THEUNSAFE; + + /** The offset to the first element in a byte array. */ + public static final long BYTE_ARRAY_BASE_OFFSET; + + public static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() + .equals(ByteOrder.LITTLE_ENDIAN); + + // This number limits the number of bytes to copy per call to Unsafe's + // copyMemory method. A limit is imposed to allow for safepoint polling + // during a large copy + static final long UNSAFE_COPY_THRESHOLD = 1024L * 1024L; + static { + THEUNSAFE = (Unsafe) AccessController.doPrivileged(new PrivilegedAction() { + @Override + public Object run() { + try { + Field f = Unsafe.class.getDeclaredField("theUnsafe"); + f.setAccessible(true); + return f.get(null); + } catch (Throwable e) { + LOG.warn("sun.misc.Unsafe is not accessible", e); + } + return null; + } + }); + + if (THEUNSAFE != null) { + BYTE_ARRAY_BASE_OFFSET = THEUNSAFE.arrayBaseOffset(byte[].class); + } else { + BYTE_ARRAY_BASE_OFFSET = -1; + } + } + + private UnsafeAccess() { + + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/ZOrderingUtil.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/ZOrderingUtil.java new file mode 100644 index 0000000000000..8d4217c4be5c7 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/ZOrderingUtil.java @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.optimize; + +import sun.misc.Unsafe; + +import java.nio.charset.Charset; + +public class ZOrderingUtil { + + static final Unsafe THEUNSAFE; + public static final int SIZEOF_LONG = Long.SIZE / Byte.SIZE; + + static { + THEUNSAFE = UnsafeAccess.THEUNSAFE; + + // sanity check - this should never fail + if (THEUNSAFE.arrayIndexScale(byte[].class) != 1) { + throw new AssertionError(); + } + } + + /** + * Lexicographically compare two arrays. + * copy from hbase + * @param buffer1 left operand + * @param buffer2 right operand + * @param offset1 Where to start comparing in the left buffer + * @param offset2 Where to start comparing in the right buffer + * @param length1 How much to compare from the left buffer + * @param length2 How much to compare from the right buffer + * @return 0 if equal, < 0 if left is less than right, etc. 
+ */ + public static int compareTo(byte[] buffer1, int offset1, int length1, + byte[] buffer2, int offset2, int length2) { + // Short circuit equal case + if (buffer1 == buffer2 && offset1 == offset2 && length1 == length2) { + return 0; + } + final int stride = 8; + final int minLength = Math.min(length1, length2); + int strideLimit = minLength & ~(stride - 1); + final long offset1Adj = offset1 + UnsafeAccess.BYTE_ARRAY_BASE_OFFSET; + final long offset2Adj = offset2 + UnsafeAccess.BYTE_ARRAY_BASE_OFFSET; + int i; + + /* + * Compare 8 bytes at a time. Benchmarking on x86 shows a stride of 8 bytes is no slower + * than 4 bytes even on 32-bit. On the other hand, it is substantially faster on 64-bit. + */ + for (i = 0; i < strideLimit; i += stride) { + long lw = THEUNSAFE.getLong(buffer1, offset1Adj + i); + long rw = THEUNSAFE.getLong(buffer2, offset2Adj + i); + if (lw != rw) { + if (!UnsafeAccess.LITTLE_ENDIAN) { + return ((lw + Long.MIN_VALUE) < (rw + Long.MIN_VALUE)) ? -1 : 1; + } + + /* + * We want to compare only the first index where left[index] != right[index]. This + * corresponds to the least significant nonzero byte in lw ^ rw, since lw and rw are + * little-endian. Long.numberOfTrailingZeros(diff) tells us the least significant + * nonzero bit, and zeroing out the first three bits of L.nTZ gives us the shift to get + * that least significant nonzero byte. This comparison logic is based on UnsignedBytes + * comparator from guava v21 + */ + int n = Long.numberOfTrailingZeros(lw ^ rw) & ~0x7; + return ((int) ((lw >>> n) & 0xFF)) - ((int) ((rw >>> n) & 0xFF)); + } + } + + // The epilogue to cover the last (minLength % stride) elements. 
+ for (; i < minLength; i++) { + int a = (buffer1[offset1 + i] & 0xFF); + int b = (buffer2[offset2 + i] & 0xFF); + if (a != b) { + return a - b; + } + } + return length1 - length2; + } + + private static byte[] paddingTo8Byte(byte[] a) { + if (a.length == 8) { + return a; + } + if (a.length > 8) { + byte[] result = new byte[8]; + System.arraycopy(a, 0, result, 0, 8); + return result; + } + int paddingSize = 8 - a.length; + byte[] result = new byte[8]; + for (int i = 0; i < paddingSize; i++) { + result[i] = 0; + } + System.arraycopy(a, 0, result, paddingSize, a.length); + + return result; + } + + /** + * Interleaving array bytes. + * Interleaving means take one bit from the first matrix element, one bit + * from the next, etc, then take the second bit from the first matrix + * element, second bit from the second, all the way to the last bit of the + * last element. Combine those bits in that order into a single BigInteger, + * @param buffer candidate element to do interleaving + * @return byte size of candidate element + */ + public static byte[] interleaving(byte[][] buffer, int size) { + int candidateSize = buffer.length; + byte[] result = new byte[size * candidateSize]; + int resBitPos = 0; + int totalBits = size * 8; + for (int bitStep = 0; bitStep < totalBits; bitStep++) { + int currentBytePos = (int) Math.floor(bitStep / 8); + int currentBitPos = bitStep % 8; + + for (int i = 0; i < candidateSize; i++) { + int tempResBytePos = (int) Math.floor(resBitPos / 8); + int tempResBitPos = resBitPos % 8; + result[tempResBytePos] = updatePos(result[tempResBytePos], tempResBitPos, buffer[i][currentBytePos], currentBitPos); + resBitPos++; + } + } + return result; + } + + public static byte updatePos(byte a, int apos, byte b, int bpos) { + byte temp = (byte) (b & (1 << (7 - bpos))); + if (apos < bpos) { + temp = (byte) (temp << (bpos - apos)); + } + if (apos > bpos) { + temp = (byte) (temp >> (apos - bpos)); + } + byte atemp = (byte) (a & (1 << (7 - apos))); + if ((byte) 
(atemp ^ temp) == 0) { + return a; + } + return (byte) (a ^ (1 << (7 - apos))); + } + + public static byte[] toBytes(int val) { + byte[] b = new byte[4]; + for (int i = 3; i > 0; i--) { + b[i] = (byte) val; + val >>>= 8; + } + b[0] = (byte) val; + return b; + } + + public static byte[] toBytes(long val) { + long temp = val; + byte[] b = new byte[8]; + for (int i = 7; i > 0; i--) { + b[i] = (byte) temp; + temp >>>= 8; + } + b[0] = (byte) temp; + return b; + } + + public static byte[] toBytes(final double d) { + return toBytes(Double.doubleToRawLongBits(d)); + } + + public static byte[] intTo8Byte(int a) { + int temp = a; + temp = temp ^ (1 << 31); + return paddingTo8Byte(toBytes(temp)); + } + + public static byte[] byteTo8Byte(byte a) { + return paddingTo8Byte(new byte[] { a }); + } + + public static byte[] longTo8Byte(long a) { + long temp = a; + temp = temp ^ (1L << 63); + return toBytes(temp); + } + + public static byte[] doubleTo8Byte(double a) { + byte[] temp = toBytes(a); + if (a > 0) { + temp[0] = (byte) (temp[0] ^ (1 << 7)); + } + if (a < 0) { + for (int i = 0; i < temp.length; i++) { + temp[i] = (byte) ~temp[i]; + } + } + return temp; + } + + public static byte[] utf8To8Byte(String a) { + return paddingTo8Byte(a.getBytes(Charset.forName("utf-8"))); + } + + public static Long convertStringToLong(String a) { + byte[] bytes = utf8To8Byte(a); + long temp = 0L; + for (int i = 7; i >= 0; i--) { + temp = temp | (((long)bytes[i] & 0xff) << (7 - i) * 8); + } + return temp; + } +} + diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java index a6c14e6d2aea3..4b582b1d53674 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java @@ -244,6 +244,16 @@ public abstract HoodieWriteMetadata 
bulkInsertPrepped(HoodieEngineContext con */ public abstract HoodieWriteMetadata insertOverwriteTable(HoodieEngineContext context, String instantTime, I records); + /** + * update statistics info for current table. + * to do adaptation, once RFC-27 is finished. + * + * @param context HoodieEngineContext + * @param instantTime Instant time for the replace action + * @param isOptimizeOperation whether current operation is OPTIMIZE type + */ + public abstract void updateStatistics(HoodieEngineContext context, List stats, String instantTime, Boolean isOptimizeOperation); + public HoodieWriteConfig getConfig() { return config; } diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/optimize/TestZOrderingUtil.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/optimize/TestZOrderingUtil.java new file mode 100644 index 0000000000000..7dab6c2057c77 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/optimize/TestZOrderingUtil.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.optimize; + +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestZOrderingUtil { + + @Test + public void testIntConvert() { + // test Int + int[] testInt = new int[] {-1, 1, -2, 10000, -100000, 2, Integer.MAX_VALUE, Integer.MIN_VALUE}; + List> valueWrappers = new ArrayList<>(); + List> convertResultWrappers = new ArrayList<>(); + for (int i = 0; i < testInt.length; i++) { + valueWrappers.add(new OrginValueWrapper<>(i, testInt[i])); + convertResultWrappers.add(new ConvertResultWrapper<>(i, ZOrderingUtil.intTo8Byte(testInt[i]))); + } + + Collections.sort(valueWrappers, ((o1, o2) -> o1.originValue.compareTo(o2.originValue))); + + Collections.sort(convertResultWrappers, ((o1, o2) -> ZOrderingUtil.compareTo(o1.result, 0, o1.result.length, o2.result, 0, o2.result.length))); + + for (int i = 0; i < testInt.length; i++) { + assertEquals(valueWrappers.get(i).index, convertResultWrappers.get(i).index); + } + } + + @Test + public void testLongConvert() { + // test Long + long[] testLong = new long[] {-1L, 1L, -2L, 10000L, -100000L, 2L, Long.MAX_VALUE, Long.MIN_VALUE}; + List> valueWrappers = new ArrayList<>(); + List> convertResultWrappers = new ArrayList<>(); + for (int i = 0; i < testLong.length; i++) { + valueWrappers.add(new OrginValueWrapper<>((long)i, testLong[i])); + convertResultWrappers.add(new ConvertResultWrapper<>((long)i, ZOrderingUtil.longTo8Byte(testLong[i]))); + } + + Collections.sort(valueWrappers, ((o1, o2) -> o1.originValue.compareTo(o2.originValue))); + + Collections.sort(convertResultWrappers, ((o1, o2) -> ZOrderingUtil.compareTo(o1.result, 0, o1.result.length, o2.result, 0, o2.result.length))); + + for (int i = 0; i < testLong.length; i++) { + assertEquals(valueWrappers.get(i).index, convertResultWrappers.get(i).index); + } + } + + @Test + public void testDoubleConvert() { 
+ // test Long + double[] testDouble = new double[] {-1.00d, 1.05d, -2.3d, 10000.002d, -100000.7d, 2.9d, Double.MAX_VALUE}; + List> valueWrappers = new ArrayList<>(); + List> convertResultWrappers = new ArrayList<>(); + for (int i = 0; i < testDouble.length; i++) { + valueWrappers.add(new OrginValueWrapper<>((Double)(i * 1.0), testDouble[i])); + convertResultWrappers.add(new ConvertResultWrapper<>((Double)(i * 1.0), ZOrderingUtil.doubleTo8Byte(testDouble[i]))); + } + + Collections.sort(valueWrappers, ((o1, o2) -> o1.originValue.compareTo(o2.originValue))); + + Collections.sort(convertResultWrappers, ((o1, o2) -> ZOrderingUtil.compareTo(o1.result, 0, o1.result.length, o2.result, 0, o2.result.length))); + + for (int i = 0; i < testDouble.length; i++) { + assertEquals(valueWrappers.get(i).index, convertResultWrappers.get(i).index); + } + } + + @Test + public void testFloatConvert() { + // test Long + float[] testDouble = new float[] {-1.00f, 1.05f, -2.3f, 10000.002f, -100000.7f, 2.9f, Float.MAX_VALUE, Float.MIN_VALUE}; + List> valueWrappers = new ArrayList<>(); + List> convertResultWrappers = new ArrayList<>(); + for (int i = 0; i < testDouble.length; i++) { + valueWrappers.add(new OrginValueWrapper<>((float)(i * 1.0), testDouble[i])); + convertResultWrappers.add(new ConvertResultWrapper<>((float)(i * 1.0), ZOrderingUtil.doubleTo8Byte((double) testDouble[i]))); + } + + Collections.sort(valueWrappers, ((o1, o2) -> o1.originValue.compareTo(o2.originValue))); + + Collections.sort(convertResultWrappers, ((o1, o2) -> ZOrderingUtil.compareTo(o1.result, 0, o1.result.length, o2.result, 0, o2.result.length))); + + for (int i = 0; i < testDouble.length; i++) { + assertEquals(valueWrappers.get(i).index, convertResultWrappers.get(i).index); + } + } + + private class ConvertResultWrapper { + T index; + byte[] result; + public ConvertResultWrapper(T index, byte[] result) { + this.index = index; + this.result = result; + } + } + + private class OrginValueWrapper { + T index; + T 
originValue; + public OrginValueWrapper(T index, T originValue) { + this.index = index; + this.originValue = originValue; + } + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java index ae0ced2c819ff..8c2089963717c 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java @@ -32,6 +32,7 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.Option; @@ -232,6 +233,11 @@ public HoodieWriteMetadata deletePartitions(HoodieEngineContext context, String throw new HoodieNotSupportedException("DeletePartitions is not supported yet"); } + @Override + public void updateStatistics(HoodieEngineContext context, List stats, String instantTime, Boolean isOptimizeOperation) { + throw new HoodieNotSupportedException("update statistics is not supported yet"); + } + @Override public HoodieWriteMetadata> upsertPrepped(HoodieEngineContext context, String instantTime, List> preppedRecords) { throw new HoodieNotSupportedException("This method should not be invoked"); diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java index 9d96ca1de99c4..ba3af89418051 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java +++ 
b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java @@ -32,6 +32,7 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.Option; @@ -144,6 +145,11 @@ public HoodieWriteMetadata> insertOverwriteTable(HoodieEngineC context, config, this, instantTime, records).execute(); } + @Override + public void updateStatistics(HoodieEngineContext context, List stats, String instantTime, Boolean isOptimizeOperation) { + throw new HoodieNotSupportedException("update statistics is not supported yet"); + } + @Override public Option scheduleCompaction(HoodieEngineContext context, String instantTime, diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java index 4100b0463e026..0dedec6486f1f 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java @@ -380,6 +380,12 @@ private void completeClustering(HoodieReplaceCommitMetadata metadata, JavaRDD performClusteringWithRecordsRDD(final JavaRDD> getPartitioner(Map strategyParams, Schema schema) { - if (strategyParams.containsKey(PLAN_STRATEGY_SORT_COLUMNS.key())) { + if (getWriteConfig().getSpaceFillingCurveDataOptimizeEnable()) { + // sort input records by z-order/hilbert + return Option.of(new RDDSpatialCurveOptimizationSortPartitioner((HoodieSparkEngineContext) getEngineContext(), + getWriteConfig(), HoodieAvroUtils.addMetadataFields(schema))); + } else if 
(strategyParams.containsKey(PLAN_STRATEGY_SORT_COLUMNS.key())) { return Option.of(new RDDCustomColumnsSortPartitioner(strategyParams.get(PLAN_STRATEGY_SORT_COLUMNS.key()).split(","), HoodieAvroUtils.addMetadataFields(schema))); } else { diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveOptimizationSortPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveOptimizationSortPartitioner.java new file mode 100644 index 0000000000000..ddea51bd006d6 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveOptimizationSortPartitioner.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.AvroConversionUtils; +import org.apache.hudi.HoodieSparkUtils; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.SerializableSchema; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.table.BulkInsertPartitioner; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.Zoptimize$; + +/** + * A partitioner that does spartial curve optimization sorting based on specified column values for each RDD partition. + * support z-curve optimization, hilbert will come soon. 
+ * @param HoodieRecordPayload type + */ +public class RDDSpatialCurveOptimizationSortPartitioner + implements BulkInsertPartitioner>> { + private final HoodieSparkEngineContext sparkEngineContext; + private final SerializableSchema serializableSchema; + private final HoodieWriteConfig config; + + public RDDSpatialCurveOptimizationSortPartitioner(HoodieSparkEngineContext sparkEngineContext, HoodieWriteConfig config, Schema schema) { + this.sparkEngineContext = sparkEngineContext; + this.config = config; + this.serializableSchema = new SerializableSchema(schema); + } + + @Override + public JavaRDD> repartitionRecords(JavaRDD> records, int outputSparkPartitions) { + String payloadClass = config.getPayloadClass(); + // do sort + JavaRDD preparedRecord = prepareGenericRecord(records, outputSparkPartitions, serializableSchema.get()); + return preparedRecord.map(record -> { + String key = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + String partition = record.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); + HoodieKey hoodieKey = new HoodieKey(key, partition); + HoodieRecordPayload avroPayload = ReflectionUtils.loadPayload(payloadClass, + new Object[] {Option.of(record)}, Option.class); + HoodieRecord hoodieRecord = new HoodieRecord(hoodieKey, avroPayload); + return hoodieRecord; + }); + } + + private JavaRDD prepareGenericRecord(JavaRDD> inputRecords, final int numOutputGroups, final Schema schema) { + SerializableSchema serializableSchema = new SerializableSchema(schema); + JavaRDD genericRecordJavaRDD = inputRecords.map(f -> (GenericRecord) f.getData().getInsertValue(serializableSchema.get()).get()); + Dataset originDF = AvroConversionUtils.createDataFrame(genericRecordJavaRDD.rdd(), schema.toString(), sparkEngineContext.getSqlContext().sparkSession()); + Dataset zDataFrame; + + switch (config.getOptimizeBuildCurveMethod()) { + case DIRECT: + zDataFrame = Zoptimize$.MODULE$.createZIndexedDataFrameByMapValue(originDF, 
config.getClusteringSortColumns(), numOutputGroups); + break; + case SAMPLE: + zDataFrame = Zoptimize$.MODULE$.createZIndexedDataFrameBySample(originDF, config.getClusteringSortColumns(), numOutputGroups); + break; + default: + throw new HoodieException("Not a valid build curve method for doWriteOperation: "); + } + return HoodieSparkUtils.createRdd(zDataFrame, schema.getName(), + schema.getNamespace(), false, org.apache.hudi.common.util.Option.empty()).toJavaRDD(); + } + + @Override + public boolean arePartitionRecordsSorted() { + return true; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java index e458d845a817f..e66c5614d8d77 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java @@ -18,6 +18,7 @@ package org.apache.hudi.table; +import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieCleanerPlan; import org.apache.hudi.avro.model.HoodieClusteringPlan; @@ -34,6 +35,7 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; @@ -72,12 +74,15 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Zoptimize$; +import scala.collection.JavaConversions; import java.io.IOException; import java.util.Collections; import java.util.Iterator; import java.util.List; 
import java.util.Map; +import java.util.stream.Collectors; /** * Implementation of a very heavily read-optimized Hoodie Table where, all data is stored in base files, with @@ -152,6 +157,32 @@ public HoodieWriteMetadata> insertOverwriteTable(HoodieEngi return new SparkInsertOverwriteTableCommitActionExecutor(context, config, this, instantTime, records).execute(); } + @Override + public void updateStatistics(HoodieEngineContext context, List stats, String instantTime, Boolean isOptimizeOperation) { + // deal with z-order/hilbert statistic info + if (isOptimizeOperation) { + updateOptimizeOperationStatistics(context, stats, instantTime); + } + } + + private void updateOptimizeOperationStatistics(HoodieEngineContext context, List stats, String instantTime) { + String cols = config.getClusteringSortColumns(); + String basePath = metaClient.getBasePath(); + String indexPath = metaClient.getZindexPath(); + List validateCommits = metaClient.getCommitsTimeline() + .filterCompletedInstants().getInstants().map(f -> f.getTimestamp()).collect(Collectors.toList()); + List touchFiles = stats.stream().map(s -> new Path(basePath, s.getPath()).toString()).collect(Collectors.toList()); + if (touchFiles.isEmpty() || cols.isEmpty() || indexPath.isEmpty()) { + LOG.warn("save nothing to index table"); + return; + } + HoodieSparkEngineContext sparkEngineContext = (HoodieSparkEngineContext)context; + Zoptimize$.MODULE$.saveStatisticsInfo(sparkEngineContext + .getSqlContext().sparkSession().read().load(JavaConversions.asScalaBuffer(touchFiles)), + cols, indexPath, instantTime, JavaConversions.asScalaBuffer(validateCommits)); + LOG.info(String.format("save statistic info sucessfully at commitTime: %s", instantTime)); + } + @Override public Option scheduleCompaction(HoodieEngineContext context, String instantTime, Option> extraMetadata) { throw new HoodieNotSupportedException("Compaction is not supported on a CopyOnWrite table"); diff --git 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java index 322d19194ae81..9013901c9a2ee 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java @@ -69,7 +69,7 @@ public HoodieWriteMetadata> bulkInsert(final JavaRDD writeStatuses = bulkInsert(inputRecords, instantTime, table, config, performDedupe, userDefinedBulkInsertPartitioner, false, config.getBulkInsertShuffleParallelism(), false); diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/Zoptimize.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/Zoptimize.scala new file mode 100644 index 0000000000000..e4623b443a273 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/Zoptimize.scala @@ -0,0 +1,830 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +import java.sql.Date +import java.util.concurrent.{Executors, ThreadPoolExecutor} + +import com.google.common.util.concurrent.ThreadFactoryBuilder +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hudi.config.HoodieClusteringConfig +import org.apache.parquet.hadoop.ParquetFileReader +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.expressions.{Alias, And, Ascending, Attribute, AttributeReference, BoundReference, EqualNullSafe, EqualTo, Expression, ExtractValue, GetStructField, GreaterThan, GreaterThanOrEqual, In, IsNotNull, IsNull, LessThan, LessThanOrEqual, Literal, Not, Or, SortOrder, StartsWith, UnsafeProjection} +import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.functions._ +import org.apache.hudi.optimize.ZOrderingUtil +import org.apache.spark.sql.hudi.execution._ +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ColumnarBatch +import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.util.{MutablePair, SerializableConfiguration} + +import scala.collection.JavaConverters._ +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer +import scala.concurrent.duration._ +import scala.concurrent.{ExecutionContext, Future} + +object Zoptimize { + + case class FileStats(val minVal: String, val maxVal: String, val num_nulls: Int = 0) + case class ColumnFileStats(val fileName: String, val colName: String, val minVal: String, val maxVal: String, val num_nulls: Int = 0) + + def 
createZIndexedDataFrameBySample(df: DataFrame, zCols: String, fileNum: Int): DataFrame = { + if (zCols == null || zCols.isEmpty) { + df + } else { + createZIndexedDataFrameBySample(df, zCols.split(",").map(_.trim), fileNum) + } + } + + /** + * create z-order DataFrame by sample + * first, sample origin data to get z-cols bounds, then create z-order DataFrame + * support all type data. + * this method need more resource and cost more time than createZIndexedDataFrameByMapValue + */ + def createZIndexedDataFrameBySample(df: DataFrame, zCols: Seq[String], fileNum: Int): DataFrame = { + val spark = df.sparkSession + val columnsMap = df.schema.fields.map(item => (item.name, item)).toMap + val fieldNum = df.schema.fields.length + val checkCols = zCols.filter(col => columnsMap(col) != null) + + if (zCols.isEmpty || checkCols.isEmpty) { + df + } else { + val zFields = zCols.map { col => + val newCol = columnsMap(col) + if (newCol == null) { + (-1, null) + } else { + newCol.dataType match { + case LongType | DoubleType | FloatType | StringType | IntegerType | DateType | TimestampType | ShortType | ByteType => + (df.schema.fields.indexOf(newCol), newCol) + case d: DecimalType => + (df.schema.fields.indexOf(newCol), newCol) + case _ => + (-1, null) + } + } + }.filter(_._1 != -1) + // Complex type found, use createZIndexedDataFrameByRange + if (zFields.length != zCols.length) { + return createZIndexedDataFrameByRange(df, zCols, fieldNum) + } + + val rawRdd = df.rdd + val sampleRdd = rawRdd.map { row => + val values = zFields.map { case (index, field) => + field.dataType match { + case LongType => + if (row.isNullAt(index)) Long.MaxValue else row.getLong(index) + case DoubleType => + if (row.isNullAt(index)) Long.MaxValue else java.lang.Double.doubleToLongBits(row.getDouble(index)) + case IntegerType => + if (row.isNullAt(index)) Long.MaxValue else row.getInt(index).toLong + case FloatType => + if (row.isNullAt(index)) Long.MaxValue else 
java.lang.Double.doubleToLongBits(row.getFloat(index).toDouble) + case StringType => + if (row.isNullAt(index)) "" else row.getString(index) + case DateType => + if (row.isNullAt(index)) Long.MaxValue else row.getDate(index).getTime + case TimestampType => + if (row.isNullAt(index)) Long.MaxValue else row.getTimestamp(index).getTime + case ByteType => + if (row.isNullAt(index)) Long.MaxValue else row.getByte(index).toLong + case ShortType => + if (row.isNullAt(index)) Long.MaxValue else row.getShort(index).toLong + case d: DecimalType => + if (row.isNullAt(index)) Long.MaxValue else row.getDecimal(index).longValue() + case _ => + null + } + }.filter(v => v != null).toArray + (values, null) + } + val zOrderBounds = df.sparkSession.sessionState.conf.getConfString( + HoodieClusteringConfig.DATA_OPTIMIZE_BUILD_CURVE_SAMPLE_NUMBER.key, + HoodieClusteringConfig.DATA_OPTIMIZE_BUILD_CURVE_SAMPLE_NUMBER.defaultValue.toString).toInt + val sample = new RangeSample(zOrderBounds, sampleRdd) + val rangeBounds = sample.getRangeBounds() + val sampleBounds = { + val candidateColNumber = rangeBounds.head._1.length + (0 to candidateColNumber - 1).map { i => + val colRangeBound = rangeBounds.map(x => (x._1(i), x._2)) + + if (colRangeBound.head._1.isInstanceOf[String]) { + sample.determineBound(colRangeBound.asInstanceOf[ArrayBuffer[(String, Float)]], math.min(zOrderBounds, rangeBounds.length), Ordering[String]) + } else { + sample.determineBound(colRangeBound.asInstanceOf[ArrayBuffer[(Long, Float)]], math.min(zOrderBounds, rangeBounds.length), Ordering[Long]) + } + } + } + + // expand bounds. 
+ // maybe it's better to use the value of "spark.zorder.bounds.number" as maxLength, + // however this will lead to extra time costs when all zorder cols distinct count values are less then "spark.zorder.bounds.number" + val maxLength = sampleBounds.map(_.length).max + val expandSampleBoundsWithFactor = sampleBounds.map { bound => + val fillFactor = maxLength / bound.size + val newBound = new Array[Double](bound.length * fillFactor) + if (bound.isInstanceOf[Array[Long]] && fillFactor > 1) { + val longBound = bound.asInstanceOf[Array[Long]] + for (i <- 0 to bound.length - 1) { + for (j <- 0 to fillFactor - 1) { + // sample factor shoud not be too large, so it's ok to use 1 / fillfactor as slice + newBound(j + i*(fillFactor)) = longBound(i) + (j + 1) * (1 / fillFactor.toDouble) + } + } + (newBound, fillFactor) + } else { + (bound, 0) + } + } + + val boundBroadCast = spark.sparkContext.broadcast(expandSampleBoundsWithFactor) + + val indexRdd = rawRdd.mapPartitions { iter => + val expandBoundsWithFactor = boundBroadCast.value + val maxBoundNum = expandBoundsWithFactor.map(_._1.length).max + val longDecisionBound = new RawDecisionBound(Ordering[Long]) + val doubleDecisionBound = new RawDecisionBound(Ordering[Double]) + val stringDecisionBound = new RawDecisionBound(Ordering[String]) + import java.util.concurrent.ThreadLocalRandom + val threadLocalRandom = ThreadLocalRandom.current + + def getRank(rawIndex: Int, value: Long, isNull: Boolean): Int = { + val (expandBound, factor) = expandBoundsWithFactor(rawIndex) + if (isNull) { + expandBound.length + 1 + } else { + if (factor > 1) { + doubleDecisionBound.getBound(value + (threadLocalRandom.nextInt(factor) + 1)*(1 / factor.toDouble), expandBound.asInstanceOf[Array[Double]]) + } else { + longDecisionBound.getBound(value, expandBound.asInstanceOf[Array[Long]]) + } + } + } + + iter.map { row => + val values = zFields.zipWithIndex.map { case ((index, field), rawIndex) => + field.dataType match { + case LongType => + val 
isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getLong(index), isNull) + case DoubleType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else java.lang.Double.doubleToLongBits(row.getDouble(index)), isNull) + case IntegerType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getInt(index).toLong, isNull) + case FloatType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else java.lang.Double.doubleToLongBits(row.getFloat(index).toDouble), isNull) + case StringType => + val factor = maxBoundNum.toDouble / expandBoundsWithFactor(rawIndex)._1.length + if (row.isNullAt(index)) { + maxBoundNum + 1 + } else { + val currentRank = stringDecisionBound.getBound(row.getString(index), expandBoundsWithFactor(rawIndex)._1.asInstanceOf[Array[String]]) + if (factor > 1) { + (currentRank*factor).toInt + threadLocalRandom.nextInt(factor.toInt) + } else { + currentRank + } + } + case DateType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getDate(index).getTime, isNull) + case TimestampType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getTimestamp(index).getTime, isNull) + case ByteType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getByte(index).toLong, isNull) + case ShortType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getShort(index).toLong, isNull) + case d: DecimalType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getDecimal(index).longValue(), isNull) + case _ => + -1 + } + }.filter(v => v != -1).map(ZOrderingUtil.intTo8Byte(_)).toArray + val zValues = ZOrderingUtil.interleaving(values, 8) + Row.fromSeq(row.toSeq ++ Seq(zValues)) + } + }.sortBy(x => ZorderingBinarySort(x.getAs[Array[Byte]](fieldNum)), numPartitions = fileNum) + val newDF = df.sparkSession.createDataFrame(indexRdd, StructType( + 
df.schema.fields ++ Seq( + StructField(s"zindex", + BinaryType, false)) + )) + newDF.drop("zindex") + } + } + + /** + * create z-order DataFrame by sample + * support all col types + */ + def createZIndexedDataFrameByRange(df: DataFrame, zCols: Seq[String], fileNum: Int): DataFrame = { + val spark = df.sparkSession + val internalRdd = df.queryExecution.toRdd + val schema = df.schema + val outputAttributes = df.queryExecution.analyzed.output + val sortingExpressions = outputAttributes.filter(p => zCols.contains(p.name)) + if (sortingExpressions.length == 0 || sortingExpressions.length != zCols.size) { + df + } else { + val zOrderBounds = df.sparkSession.sessionState.conf.getConfString( + HoodieClusteringConfig.DATA_OPTIMIZE_BUILD_CURVE_SAMPLE_NUMBER.key, + HoodieClusteringConfig.DATA_OPTIMIZE_BUILD_CURVE_SAMPLE_NUMBER.defaultValue.toString).toInt + + val sampleRdd = internalRdd.mapPartitionsInternal { iter => + val projection = UnsafeProjection.create(sortingExpressions, outputAttributes) + val mutablePair = new MutablePair[InternalRow, Null]() + // Internally, RangePartitioner runs a job on the RDD that samples keys to compute + // partition bounds. To get accurate samples, we need to copy the mutable keys. 
+ iter.map(row => mutablePair.update(projection(row).copy(), null)) + } + + val orderings = sortingExpressions.map(SortOrder(_, Ascending)).zipWithIndex.map { case (ord, i) => + ord.copy(child = BoundReference(i, ord.dataType, ord.nullable)) + } + + val lazyGeneratedOrderings = orderings.map(ord => new LazilyGeneratedOrdering(Seq(ord))) + + val sample = new RangeSample(zOrderBounds, sampleRdd) + + val rangeBounds = sample.getRangeBounds() + + implicit val ordering1 = lazyGeneratedOrderings(0) + + val sampleBounds = sample.determineRowBounds(rangeBounds, math.min(zOrderBounds, rangeBounds.length), lazyGeneratedOrderings, sortingExpressions) + + val origin_orderings = sortingExpressions.map(SortOrder(_, Ascending)).map { ord => + ord.copy(child = BoundReference(0, ord.dataType, ord.nullable)) + } + + val origin_lazyGeneratedOrderings = origin_orderings.map(ord => new LazilyGeneratedOrdering(Seq(ord))) + + // expand bounds. + // maybe it's better to use the value of "spark.zorder.bounds.number" as maxLength, + // however this will lead to extra time costs when all zorder cols distinct count values are less then "spark.zorder.bounds.number" + val maxLength = sampleBounds.map(_.length).max + val expandSampleBoundsWithFactor = sampleBounds.map { bound => + val fillFactor = maxLength / bound.size.toDouble + (bound, fillFactor) + } + + val boundBroadCast = spark.sparkContext.broadcast(expandSampleBoundsWithFactor) + + val indexRdd = internalRdd.mapPartitionsInternal { iter => + val boundsWithFactor = boundBroadCast.value + import java.util.concurrent.ThreadLocalRandom + val threadLocalRandom = ThreadLocalRandom.current + val maxBoundNum = boundsWithFactor.map(_._1.length).max + val origin_Projections = sortingExpressions.map { se => + UnsafeProjection.create(Seq(se), outputAttributes) + } + + iter.map { unsafeRow => + val interleaveValues = origin_Projections.zip(origin_lazyGeneratedOrderings).zipWithIndex.map { case ((rowProject, lazyOrdering), index) => + val row = 
rowProject(unsafeRow) + val decisionBound = new RawDecisionBound(lazyOrdering) + if (row.isNullAt(0)) { + maxBoundNum + 1 + } else { + val (bound, factor) = boundsWithFactor(index) + if (factor > 1) { + val currentRank = decisionBound.getBound(row, bound.asInstanceOf[Array[InternalRow]]) + currentRank*factor.toInt + threadLocalRandom.nextInt(factor.toInt) + } else { + decisionBound.getBound(row, bound.asInstanceOf[Array[InternalRow]]) + } + } + }.toArray.map(ZOrderingUtil.intTo8Byte(_)) + val zValues = ZOrderingUtil.interleaving(interleaveValues, 8) + val mutablePair = new MutablePair[InternalRow, Array[Byte]]() + + mutablePair.update(unsafeRow, zValues) + } + }.sortBy(x => ZorderingBinarySort(x._2), numPartitions = fileNum).map(_._1) + spark.internalCreateDataFrame(indexRdd, schema) + } + } + + def getMinMaxValueSpark(df: DataFrame, cols: Seq[String]): DataFrame = { + val sqlContext = df.sparkSession.sqlContext + import sqlContext.implicits._ + + val values = cols.flatMap(c => Seq( min(col(c)).as(c + "_minValue"), max(col(c)).as(c + "_maxValue"), count(c).as(c + "_noNullCount"))) + val valueCounts = count("*").as("totalNum") + val projectValues = Seq(col("file")) ++ cols.flatMap(c => + Seq(col(c + "_minValue"), col(c + "_maxValue"), expr(s"totalNum - ${c + "_noNullCount"}").as(c + "_num_nulls"))) + + val result = df.select(input_file_name() as "file", col("*")) + .groupBy($"file") + .agg(valueCounts, values: _*).select(projectValues:_*) + result + } + + def getMinMaxValue(df: DataFrame, zCols: String): DataFrame = { + + val rawCols = zCols.split(",").map(_.trim) + + val columnsMap = df.schema.fields.map(item => (item.name, item)).toMap + + val cols = rawCols.filter { col => + if (columnsMap.contains(col)) { + columnsMap(col).dataType match { + case IntegerType | DoubleType | StringType | DateType | LongType | FloatType | ShortType => + true + case a: DecimalType => + true + case other => + false + } + } else { + false + } + } + + if (cols.size != rawCols.size) 
return getMinMaxValueSpark(df, rawCols) + + val inputFiles = df.inputFiles + val conf = df.sparkSession.sparkContext.hadoopConfiguration + + val startTime = System.nanoTime() + + val allMetaData: Array[ColumnFileStats] = if (inputFiles.length < 10) { + + val listParallelism = math.min(Runtime.getRuntime.availableProcessors()/2 + 1, inputFiles.length) + + val slicedInputFiles = inputFiles.grouped(listParallelism) + + val threadPool = { + val threadFactory = new ThreadFactoryBuilder().setDaemon(true).setNameFormat("columnStatics" + "-%d").build() + Executors.newFixedThreadPool(listParallelism, threadFactory).asInstanceOf[ThreadPoolExecutor] + } + + try { + implicit val executionContext = ExecutionContext.fromExecutor(threadPool) + val staticTasks = slicedInputFiles.map { paths => + Future { + paths.map(new Path(_)).flatMap { filePath => + val blocks = ParquetFileReader.readFooter(conf, filePath).getBlocks().asScala + blocks.flatMap(b => b.getColumns().asScala. + map(col => (col.getPath().toDotString(), + FileStats(col.getStatistics().minAsString(), col.getStatistics().maxAsString(), col.getStatistics.getNumNulls.toInt)))) + .groupBy(x => x._1).mapValues(v => v.map(vv => vv._2)). + mapValues(value => FileStats(value.map(_.minVal).min, value.map(_.maxVal).max, value.map(_.num_nulls).max)).toSeq. 
+ map(x => ColumnFileStats(filePath.getName(), x._1, x._2.minVal, x._2.maxVal, x._2.num_nulls)) + }.filter(p => cols.contains(p.colName)) + } + } + + val futureResult = try { + val awaitPermission = null.asInstanceOf[scala.concurrent.CanAwait] + Future.sequence(staticTasks).result(Duration.Inf)(awaitPermission) + } catch { + case e: Throwable => + throw e + } + futureResult.flatMap(x => x).toArray + } finally { + threadPool.shutdown() + } + } else { + val sc = df.sparkSession.sparkContext + val serializableConfiguration = new SerializableConfiguration(conf) + val numParallelism = inputFiles.size/3 + val previousJobDescription = sc.getLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION) + try { + val description = s"Listing parquet column statistics" + sc.setJobDescription(description) + sc.parallelize(inputFiles, numParallelism).mapPartitions { paths => + val hadoopConf = serializableConfiguration.value + paths.map(new Path(_)).flatMap { filePath => + val blocks = ParquetFileReader.readFooter(hadoopConf, filePath).getBlocks().asScala + blocks.flatMap(b => b.getColumns().asScala. + map(col => (col.getPath().toDotString(), + FileStats(col.getStatistics().minAsString(), col.getStatistics().maxAsString(), col.getStatistics.getNumNulls.toInt)))) + .groupBy(x => x._1).mapValues(v => v.map(vv => vv._2)). + mapValues(value => FileStats(value.map(_.minVal).min, value.map(_.maxVal).max, value.map(_.num_nulls).max)).toSeq. 
+ map(x => ColumnFileStats(filePath.getName(), x._1, x._2.minVal, x._2.maxVal, x._2.num_nulls)) + }.filter(p => cols.contains(p.colName)) + }.collect() + } finally { + sc.setJobDescription(previousJobDescription) + } + } + + val allMetaDataRDD = df.sparkSession.sparkContext.parallelize(allMetaData.groupBy(x => x.fileName).mapValues { css => + val size = css.length + if (size == 0) { + null + } else { + val rows = new ArrayBuffer[Any]() + rows.append(css.head.fileName) + cols.foreach { col => + val cs = css.find(p => p.colName.equals(col)).get + columnsMap(cs.colName).dataType match { + case IntegerType => + rows.append(cs.minVal.toInt) + rows.append(cs.maxVal.toInt) + case DoubleType => + rows.append(cs.minVal.toDouble) + rows.append(cs.maxVal.toDouble) + case StringType => + rows.append(cs.minVal) + rows.append(cs.maxVal) + case a: DecimalType => + rows.append(BigDecimal(cs.minVal)) + rows.append(BigDecimal(cs.maxVal)) + case DateType => + rows.append(Date.valueOf(cs.minVal)) + rows.append(Date.valueOf(cs.maxVal)) + case LongType => + rows.append(cs.minVal.toLong) + rows.append(cs.maxVal.toLong) + case ShortType => + rows.append(cs.minVal.toShort) + rows.append(cs.maxVal.toShort) + case FloatType => + rows.append(cs.minVal.toFloat) + rows.append(cs.maxVal.toFloat) + } + rows.append(cs.num_nulls) + } + Row.fromSeq(rows) + } + }.map(_._2).filter(x => x != null).toSeq, 1) + + val allMetaDataSchema = { + val neededFields = mutable.ListBuffer[StructField]() + neededFields.append(new StructField("file", StringType, false)) + cols.foreach { col => + neededFields.append(columnsMap(col).copy(name = col + "_minValue")) + neededFields.append(columnsMap(col).copy(name = col + "_maxValue")) + neededFields.append(new StructField( col + "_num_nulls", IntegerType, true)) + } + StructType(neededFields) + } + + val metaDF = df.sparkSession.createDataFrame(allMetaDataRDD, allMetaDataSchema) + metaDF + } + + def createZIndexedDataFrameByMapValue(df: DataFrame, zCols: String, fileNum: 
Int): DataFrame = { + if (zCols == null || zCols.isEmpty) { + df + } else { + createZIndexedDataFrameByMapValue(df, zCols.split(",").map(_.trim), fileNum) + } + } + + /** + * create z-order DataFrame directly + * first, map all base type data to byte[8], then create z-order DataFrame + * only support base type data. long,int,short,double,float,string,timestamp,decimal,date,byte + * this method is more effective than createZIndexDataFrameBySample + */ + def createZIndexedDataFrameByMapValue(df: DataFrame, zCols: Seq[String], fileNum: Int): DataFrame = { + val columnsMap = df.schema.fields.map(item => (item.name, item)).toMap + val fieldNum = df.schema.fields.length + + val checkCols = zCols.filter( col => columnsMap(col) != null) + + if (zCols.length ==0 && checkCols.size != zCols.size) { + df + } else { + val zFields = zCols.map { col => + val newCol = columnsMap(col) + (df.schema.fields.indexOf(newCol), newCol) + } + + val newRDD = df.rdd.map { row => + val values = zFields.map { case (index, field) => + field.dataType match { + case LongType => + ZOrderingUtil.longTo8Byte(if (row.isNullAt(index)) Long.MaxValue else row.getLong(index)) + case DoubleType => + ZOrderingUtil.doubleTo8Byte(if (row.isNullAt(index)) Double.MaxValue else row.getDouble(index)) + case IntegerType => + ZOrderingUtil.intTo8Byte(if (row.isNullAt(index)) Int.MaxValue else row.getInt(index)) + case FloatType => + ZOrderingUtil.doubleTo8Byte(if (row.isNullAt(index)) Float.MaxValue else row.getFloat(index).toDouble) + case StringType => + ZOrderingUtil.utf8To8Byte(if (row.isNullAt(index)) "" else row.getString(index)) + case DateType => + ZOrderingUtil.longTo8Byte(if (row.isNullAt(index)) Long.MaxValue else row.getDate(index).getTime) + case TimestampType => + ZOrderingUtil.longTo8Byte(if (row.isNullAt(index)) Long.MaxValue else row.getTimestamp(index).getTime) + case ByteType => + ZOrderingUtil.byteTo8Byte(if (row.isNullAt(index)) Byte.MaxValue else row.getByte(index)) + case ShortType => + 
ZOrderingUtil.intTo8Byte(if (row.isNullAt(index)) Short.MaxValue else row.getShort(index).toInt) + case d: DecimalType => + ZOrderingUtil.longTo8Byte(if (row.isNullAt(index)) Long.MaxValue else row.getDecimal(index).longValue()) + case _ => + null + } + }.filter(v => v != null).toArray + val zValues = ZOrderingUtil.interleaving(values, 8) + Row.fromSeq(row.toSeq ++ Seq(zValues)) + }.sortBy(x => ZorderingBinarySort(x.getAs[Array[Byte]](fieldNum)), numPartitions = fileNum) + + val newDF = df.sparkSession.createDataFrame(newRDD, StructType( + df.schema.fields ++ Seq( + StructField(s"zindex", + BinaryType, false)) + )) + newDF.drop("zindex") + } + } + + /** + * create z_index filter and push those filters to index table to filter all candidate scan files. + * @param condition origin filter from query. + * @param indexSchema schema from index table. + * @return filters for index table. + */ + def createZindexFilter(condition: Expression, indexSchema: StructType): Expression = { + def buildExpressionInternal(colName: Seq[String], statisticValue: String): Expression = { + val appendColName = UnresolvedAttribute(colName).name + statisticValue + col(appendColName).expr + } + + def reWriteCondition(colName: Seq[String], conditionExpress: Expression): Expression = { + val appendColName = UnresolvedAttribute(colName).name + "_minValue" + if (indexSchema.exists(p => p.name == appendColName)) { + conditionExpress + } else { + Literal.TrueLiteral + } + } + + val minValue = (colName: Seq[String]) => buildExpressionInternal(colName, "_minValue") + val maxValue = (colName: Seq[String]) => buildExpressionInternal(colName, "_maxValue") + val num_nulls = (colName: Seq[String]) => buildExpressionInternal(colName, "_num_nulls") + + condition match { + // query filter "colA = b" convert it to "colA_minValue <= b and colA_maxValue >= b" for index table + case EqualTo(attribute: AttributeReference, value: Literal) => + val colName = getTargetColNameParts(attribute) + 
reWriteCondition(colName, And(LessThanOrEqual(minValue(colName), value), GreaterThanOrEqual(maxValue(colName), value))) + // query filter "b = colA" convert it to "colA_minValue <= b and colA_maxValue >= b" for index table + case EqualTo(value: Literal, attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, And(LessThanOrEqual(minValue(colName), value), GreaterThanOrEqual(maxValue(colName), value))) + // query filter "colA = null" convert it to "colA_num_nulls = null" for index table + case equalNullSafe @ EqualNullSafe(_: AttributeReference, _ @ Literal(null, _)) => + val colName = getTargetColNameParts(equalNullSafe.left) + reWriteCondition(colName, EqualTo(num_nulls(colName), equalNullSafe.right)) + // query filter "colA < b" convert it to "colA_minValue < b" for index table + case LessThan(attribute: AttributeReference, value: Literal) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName,LessThan(minValue(colName), value)) + // query filter "b < colA" convert it to "colA_maxValue > b" for index table + case LessThan(value: Literal, attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, GreaterThan(maxValue(colName), value)) + // query filter "colA > b" convert it to "colA_maxValue > b" for index table + case GreaterThan(attribute: AttributeReference, value: Literal) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, GreaterThan(maxValue(colName), value)) + // query filter "b > colA" convert it to "colA_minValue < b" for index table + case GreaterThan(value: Literal, attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, LessThan(minValue(colName), value)) + // query filter "colA <= b" convert it to "colA_minValue <= b" for index table + case LessThanOrEqual(attribute: AttributeReference, value: Literal) => + val colName = 
getTargetColNameParts(attribute) + reWriteCondition(colName, LessThanOrEqual(minValue(colName), value)) + // query filter "b <= colA" convert it to "colA_maxValue >= b" for index table + case LessThanOrEqual(value: Literal, attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, GreaterThanOrEqual(maxValue(colName), value)) + // query filter "colA >= b" convert it to "colA_maxValue >= b" for index table + case GreaterThanOrEqual(attribute: AttributeReference, right: Literal) => + val colName = getTargetColNameParts(attribute) + GreaterThanOrEqual(maxValue(colName), right) + // query filter "b >= colA" convert it to "colA_minValue <= b" for index table + case GreaterThanOrEqual(value: Literal, attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, LessThanOrEqual(minValue(colName), value)) + // query filter "colA is null" convert it to "colA_num_nulls > 0" for index table + case IsNull(attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, GreaterThan(num_nulls(colName), Literal(0))) + // query filter "colA is not null" convert it to "colA_num_nulls = 0" for index table + case IsNotNull(attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, EqualTo(num_nulls(colName), Literal(0))) + // query filter "colA in (a,b)" convert it to " (colA_minValue <= a and colA_maxValue >= a) or (colA_minValue <= b and colA_maxValue >= b) " for index table + case In(attribute: AttributeReference, list: Seq[Literal]) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, list.map { lit => + And(LessThanOrEqual(minValue(colName), lit), GreaterThanOrEqual(maxValue(colName), lit)) + }.reduce(Or)) + // query filter "colA like xxx" convert it to " (colA_minValue <= xxx and colA_maxValue >= xxx) or (colA_min start with xxx or colA_max start with 
xxx) " for index table + case StartsWith(attribute, v @ Literal(_: UTF8String, _)) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, Or(And(LessThanOrEqual(minValue(colName), v), GreaterThanOrEqual(maxValue(colName), v)) , + Or(StartsWith(minValue(colName), v), StartsWith(maxValue(colName), v)))) + // query filter "colA not in (a, b)" convert it to " (not( colA_minValue = a and colA_maxValue = a)) and (not( colA_minValue = b and colA_maxValue = b)) " for index table + case Not(In(attribute: AttributeReference, list: Seq[Literal])) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, list.map { lit => + Not(And(EqualTo(minValue(colName), lit), EqualTo(maxValue(colName), lit))) + }.reduce(And)) + // query filter "colA != b" convert it to "not ( colA_minValue = b and colA_maxValue = b )" for index table + case Not(EqualTo(attribute: AttributeReference, value: Literal)) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, Not(And(EqualTo(minValue(colName), value), EqualTo(maxValue(colName), value)))) + // query filter "b != colA" convert it to "not ( colA_minValue = b and colA_maxValue = b )" for index table + case Not(EqualTo(value: Literal, attribute: AttributeReference)) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, Not(And(EqualTo(minValue(colName), value), EqualTo(maxValue(colName), value)))) + // query filter "colA not like xxxx" convert it to "not ( colA_minValue startWith xxx and colA_maxValue startWith xxx)" for index table + case Not(StartsWith(attribute, value @ Literal(_: UTF8String, _))) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, Not(And(StartsWith(minValue(colName), value), StartsWith(maxValue(colName), value)))) + case or: Or => + val resLeft = createZindexFilter(or.left, indexSchema) + val resRight = createZindexFilter(or.right, indexSchema) + Or(resLeft, resRight) + + case and: And => + val resLeft = 
createZindexFilter(and.left, indexSchema) + val resRight = createZindexFilter(and.right, indexSchema) + And(resLeft, resRight) + + case expr: Expression => + Literal.TrueLiteral + } + } + + /** + * Extracts name from a resolved expression referring to a nested or non-nested column. + */ + def getTargetColNameParts(resolvedTargetCol: Expression): Seq[String] = { + resolvedTargetCol match { + case attr: Attribute => Seq(attr.name) + + case Alias(c, _) => getTargetColNameParts(c) + + case GetStructField(c, _, Some(name)) => getTargetColNameParts(c) :+ name + + case ex: ExtractValue => + throw new AnalysisException(s"convert reference to name failed, Updating nested fields is only supported for StructType: ${ex}.") + + case other => + throw new AnalysisException(s"convert reference to name failed, Found unsupported expression ${other}") + } + } + + def createDataFrameInternal( + spark: SparkSession, + catalystRows: RDD[InternalRow], + schema: StructType, + isStreaming: Boolean = false): DataFrame = { + spark.internalCreateDataFrame(catalystRows, schema, isStreaming) + } + + def getIndexFiles(conf: Configuration, indexPath: String): Seq[FileStatus] = { + val basePath = new Path(indexPath) + basePath.getFileSystem(conf) + .listStatus(basePath).filterNot(f => shouldFilterOutPathName(f.getPath.getName)) + } + + /** + * read parquet files concurrently by local. 
+ * this method is much faster than Spark + */ + def readParquetFile(spark: SparkSession, indexFiles: Seq[FileStatus], filters: Seq[Filter] = Nil, schemaOpts: Option[StructType] = None): Set[String] = { + val hadoopConf = spark.sparkContext.hadoopConfiguration + val partitionedFiles = indexFiles.map(f => PartitionedFile(InternalRow.empty, f.getPath.toString, 0, f.getLen)) + + val requiredSchema = new StructType().add("file", StringType, true) + val schema = schemaOpts.getOrElse(requiredSchema) + val parquetReader = new ParquetFileFormat().buildReaderWithPartitionValues(spark + , schema , StructType(Nil), requiredSchema, filters, Map.empty, hadoopConf) + val results = new Array[Iterator[String]](partitionedFiles.size) + partitionedFiles.zipWithIndex.par.foreach { case (pf, index) => + val fileIterator = parquetReader(pf).asInstanceOf[Iterator[Any]] + val rows = fileIterator.flatMap(_ match { + case r: InternalRow => Seq(r) + case b: ColumnarBatch => b.rowIterator().asScala + }).map(r => r.getString(0)) + results(index) = rows + } + results.flatMap(f => f).toSet + } + + def shouldFilterOutPathName(pathName: String): Boolean = { + // We filter the following paths: + // 1. everything that starts with _ and ., except _common_metadata and metadata + // because Parquet needs to find those metadata files from leaf files returned by this method. + // We should refactor this logic to not mix metadata files with data files. + // 2. everything that ends with ._COPYING_, because this is an intermediate state of the file. We + // should skip this file in case of double reading. + val exclude = (pathName.startsWith("") && !pathName.contains("=")) || + pathName.startsWith(".") || pathName.endsWith(".COPYING") + val include = pathName.startsWith("_common_metadata") || pathName.startsWith("_metadata") + exclude && !include + } + + /** + * update statistics info. 
+ * this method will update the old index table by a full outer join, + * and save the updated table into a new index table based on commitTime. + * the old index table will also be cleaned. + */ + def saveStatisticsInfo( + df: DataFrame, + cols: String, + indexPath: String, + commitTime: String, + validateCommits: Seq[String]): Unit = { + val savePath = new Path(indexPath, commitTime) + val spark = df.sparkSession + val fs = savePath.getFileSystem(spark.sparkContext.hadoopConfiguration) + val statisticsDF = getMinMaxValue(df, cols) + // try to find the last valid index table from the index path + if (fs.exists(new Path(indexPath))) { + // find all the indexTable from .hoodie/.index + val allIndexTables = fs.listStatus(new Path(indexPath)).filter(_.isDirectory) + .map(_.getPath.getName) + val candidateIndexTables = allIndexTables.filter(f => validateCommits.contains(f)).sortBy(x => x).toList + val residualTables = allIndexTables.filter(f => !validateCommits.contains(f)) + + val optIndexDf = if (candidateIndexTables.isEmpty) { + None + } else { + try { + Some(spark.read.load(new Path(indexPath, candidateIndexTables.last).toString)) + } catch { + case _: Throwable => + None + } + } + // clean old index table, keep at most 1 index table + candidateIndexTables.dropRight(1).foreach(f => fs.delete(new Path(indexPath, f))) + // clean residualTables + // retried cluster operations at the same instant time are also considered, + // the residual files produced by retries are cleaned up before saving statistics + // save statistics info to the index table named by commitTime + residualTables.foreach(f => fs.delete(new Path(indexPath, f))) + if (optIndexDf.isDefined && optIndexDf.get.schema.equals(statisticsDF.schema)) { + val originalTable = "indexTable_" + java.util.UUID.randomUUID().toString.replace("-", "") + val updateTable = "updateTable_" + java.util.UUID.randomUUID().toString.replace("-", "") + optIndexDf.get.registerTempTable(originalTable) + statisticsDF.registerTempTable(updateTable) + 
// update table by full out join + val cols = optIndexDf.get.schema.map(_.name) + spark.sql(createSql(originalTable, updateTable, cols)).repartition(1).write.save(savePath.toString) + } else { + statisticsDF.repartition(1).write.mode("overwrite").save(savePath.toString) + } + } else { + statisticsDF.repartition(1).write.mode("overwrite").save(savePath.toString) + } + + } + + private def createSql(leftTable: String, rightTable: String, cols: Seq[String]): String = { + var selectsql = "" + for (i <- (0 to cols.size-1)) { + selectsql = selectsql + s" if (${leftTable}.${cols(0)} is null, ${rightTable}.${cols(i)}, ${leftTable}.${cols(i)}) as ${cols(i)} ," + } + "select " + selectsql.dropRight(1) + s" from ${leftTable} full join ${rightTable} on ${leftTable}.${cols(0)} = ${rightTable}.${cols(0)}" + } +} + diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/execution/RangeSample.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/execution/RangeSample.scala new file mode 100644 index 0000000000000..7a35da4d85156 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/execution/RangeSample.scala @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.execution + +import java.util + +import org.apache.spark.rdd.{PartitionPruningRDD, RDD} +import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection, UnsafeRow} +import org.apache.hudi.optimize.ZOrderingUtil +import org.apache.spark.util.random.SamplingUtils + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer +import scala.reflect.{ClassTag, classTag} +import scala.util.hashing.byteswap32 + +class RangeSample[K: ClassTag, V]( + zEncodeNum: Int, + rdd: RDD[_ <: Product2[K, V]], + private var ascend: Boolean = true, + val samplePointsPerPartitionHint: Int = 20) extends Serializable { + + // We allow zEncodeNum = 0, which happens when sorting an empty RDD under the default settings. + require(zEncodeNum >= 0, s"Number of zEncodeNum cannot be negative but found $zEncodeNum.") + require(samplePointsPerPartitionHint > 0, + s"Sample points per partition must be greater than 0 but found $samplePointsPerPartitionHint") + + def getRangeBounds(): ArrayBuffer[(K, Float)] = { + if (zEncodeNum <= 1) { + ArrayBuffer.empty[(K, Float)] + } else { + // This is the sample size we need to have roughly balanced output partitions, capped at 1M. + // Cast to double to avoid overflowing ints or longs + val sampleSize = math.min(samplePointsPerPartitionHint.toDouble * zEncodeNum, 1e6) + // Assume the input partitions are roughly balanced and over-sample a little bit. + val sampleSizePerPartition = math.ceil(3.0 * sampleSize / rdd.partitions.length).toInt + val (numItems, sketched) = sketch(rdd.map(_._1), sampleSizePerPartition) + if (numItems == 0L) { + ArrayBuffer.empty[(K, Float)] + } else { + // If a partition contains much more than the average number of items, we re-sample from it + // to ensure that enough items are collected from that partition. 
+ val fraction = math.min(sampleSize / math.max(numItems, 1L), 1.0) + val candidates = ArrayBuffer.empty[(K, Float)] + val imbalancedPartitions = mutable.Set.empty[Int] + + sketched.foreach { case (idx, n, sample) => + if (fraction * n > sampleSizePerPartition) { + imbalancedPartitions += idx + } else { + // The weight is 1 over the sampling probability. + val weight = (n.toDouble / sample.length).toFloat + for (key <- sample) { + candidates += ((key, weight)) + } + } + } + + if (imbalancedPartitions.nonEmpty) { + // Re-sample imbalanced partitions with the desired sampling probability. + val imbalanced = new PartitionPruningRDD(rdd.map(_._1), imbalancedPartitions.contains) + val seed = byteswap32(-rdd.id - 1) + val reSampled = imbalanced.sample(withReplacement = false, fraction, seed).collect() + val weight = (1.0 / fraction).toFloat + candidates ++= reSampled.map(x => (x, weight)) + } + candidates + } + } + } + + /** + * Determines the bounds for range partitioning from candidates with weights indicating how many + * items each represents. Usually this is 1 over the probability used to sample this candidate. + * + * @param candidates unordered candidates with weights + * @param partitions number of partitions + * @return selected bounds + */ + def determineBound[K : Ordering : ClassTag]( + candidates: ArrayBuffer[(K, Float)], + partitions: Int, ordering: Ordering[K]): Array[K] = { + val ordered = candidates.sortBy(_._1)(ordering) + val numCandidates = ordered.size + val sumWeights = ordered.map(_._2.toDouble).sum + val step = sumWeights / partitions + var cumWeight = 0.0 + var target = step + val bounds = ArrayBuffer.empty[K] + var i = 0 + var j = 0 + var previousBound = Option.empty[K] + while ((i < numCandidates) && (j < partitions - 1)) { + val (key, weight) = ordered(i) + cumWeight += weight + if (cumWeight >= target) { + // Skip duplicate values. 
+ if (previousBound.isEmpty || ordering.gt(key, previousBound.get)) { + bounds += key + target += step + j += 1 + previousBound = Some(key) + } + } + i += 1 + } + bounds.toArray + } + + def determineRowBounds[K : Ordering : ClassTag]( + candidates: ArrayBuffer[(K, Float)], + partitions: Int, orderings: Seq[Ordering[K]], + attributes: Seq[Attribute]): Array[Array[UnsafeRow]] = { + + orderings.zipWithIndex.map { case (ordering, index) => + val ordered = candidates.sortBy(_._1)(ordering) + val numCandidates = ordered.size + val sumWeights = ordered.map(_._2.toDouble).sum + val step = sumWeights / partitions + var cumWeight = 0.0 + var target = step + val bounds = ArrayBuffer.empty[K] + var i = 0 + var j = 0 + var previousBound = Option.empty[K] + while ((i < numCandidates) && (j < partitions - 1)) { + val (key, weight) = ordered(i) + cumWeight += weight + if (cumWeight >= target) { + // Skip duplicate values. + if (previousBound.isEmpty || ordering.gt(key, previousBound.get)) { + bounds += key + target += step + j += 1 + previousBound = Some(key) + } + } + i += 1 + } + // build project + val project = UnsafeProjection.create(Seq(attributes(index)), attributes) + bounds.map { bound => + val row = bound.asInstanceOf[UnsafeRow] + project(row).copy() + }.toArray + }.toArray + } + + /** + * Sketches the input RDD via reservoir sampling on each partition. 
+ * + * @param rdd the input RDD to sketch + * @param sampleSizePerPartition max sample size per partition + * @return (total number of items, an array of (partitionId, number of items, sample)) + */ + def sketch[K: ClassTag]( + rdd: RDD[K], + sampleSizePerPartition: Int): (Long, Array[(Int, Long, Array[K])]) = { + val shift = rdd.id + // val classTagK = classTag[K] // to avoid serializing the entire partitioner object + val sketched = rdd.mapPartitionsWithIndex { (idx, iter) => + val seed = byteswap32(idx ^ (shift << 16)) + val (sample, n) = SamplingUtils.reservoirSampleAndCount( + iter, sampleSizePerPartition, seed) + Iterator((idx, n, sample)) + }.collect() + val numItems = sketched.map(_._2).sum + (numItems, sketched) + } +} + +class RawDecisionBound[K : Ordering : ClassTag](ordering: Ordering[K]) extends Serializable { + + private var binarySearch: ((Array[K], K) => Int) = { + // For primitive keys, we can use the natural ordering. Otherwise, use the Ordering comparator. + classTag[K] match { + case ClassTag.Float => + (l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[Float]], x.asInstanceOf[Float]) + case ClassTag.Double => + (l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[Double]], x.asInstanceOf[Double]) + case ClassTag.Byte => + (l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[Byte]], x.asInstanceOf[Byte]) + case ClassTag.Char => + (l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[Char]], x.asInstanceOf[Char]) + case ClassTag.Short => + (l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[Short]], x.asInstanceOf[Short]) + case ClassTag.Int => + (l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[Int]], x.asInstanceOf[Int]) + case ClassTag.Long => + (l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[Long]], x.asInstanceOf[Long]) + case _ => + val comparator = ordering.asInstanceOf[java.util.Comparator[Any]] + (l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[AnyRef]], x, comparator) + } + } + + def getBound(key: 
Any, candidateBounds: Array[K]): Int = { + val k = key.asInstanceOf[K] + var bound = 0 + if (candidateBounds.length <= 128) { + while(bound < candidateBounds.length && ordering.gt(k, candidateBounds(bound))) { + bound += 1 + } + } else { + bound = binarySearch(candidateBounds, k) + if (bound < 0 ) { + bound = -bound - 1 + } + if (bound > candidateBounds.length) { + bound = candidateBounds.length + } + } + bound + } +} + +case class ZorderingBinarySort(b: Array[Byte]) extends Ordered[ZorderingBinarySort] with Serializable { + override def compare(that: ZorderingBinarySort): Int = { + val len = this.b.length + ZOrderingUtil.compareTo(this.b, 0, len, that.b, 0, len) + } +} + diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java index 2b94d7ff072a5..47f27b161fc83 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java @@ -79,6 +79,7 @@ public class HoodieTableMetaClient implements Serializable { public static final String AUXILIARYFOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".aux"; public static final String BOOTSTRAP_INDEX_ROOT_FOLDER_PATH = AUXILIARYFOLDER_NAME + Path.SEPARATOR + ".bootstrap"; public static final String HEARTBEAT_FOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".heartbeat"; + public static final String ZINDEX_NAME = ".index"; public static final String BOOTSTRAP_INDEX_BY_PARTITION_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH + Path.SEPARATOR + ".partitions"; public static final String BOOTSTRAP_INDEX_BY_FILE_ID_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH + Path.SEPARATOR @@ -176,6 +177,13 @@ public String getMetaPath() { return metaPath; } + /** + * @return z-index path + */ + public String getZindexPath() { + return new Path(metaPath, ZINDEX_NAME).toString(); + } + /** * @return Temp Folder 
path */ diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala index af0c2cc11b026..90b99ba62e118 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala @@ -28,9 +28,9 @@ import org.apache.hudi.common.table.view.{FileSystemViewStorageConfig, HoodieTab import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.internal.Logging -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{SparkSession, Zoptimize, Column} import org.apache.spark.sql.avro.SchemaConverters -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, BoundReference, Expression, InterpretedPredicate} +import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, BoundReference, Expression, InterpretedPredicate} import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} import org.apache.spark.sql.catalyst.{InternalRow, expressions} import org.apache.spark.sql.execution.datasources.{FileIndex, FileStatusCache, NoopCache, PartitionDirectory} @@ -41,6 +41,7 @@ import org.apache.spark.unsafe.types.UTF8String import java.util.Properties import scala.collection.JavaConverters._ +import scala.collection.JavaConversions._ import scala.collection.mutable /** @@ -84,6 +85,12 @@ case class HoodieFileIndex( private val specifiedQueryInstant = options.get(DataSourceReadOptions.TIME_TRAVEL_AS_OF_INSTANT.key) .map(HoodieSqlUtils.formatQueryInstant) + /** + * Get all completeCommits. + */ + lazy val completeCommits = metaClient.getCommitsTimeline() + .filterCompletedInstants().getInstants().iterator().toList.map(_.getTimestamp) + /** * Get the schema of the table. 
*/ @@ -147,6 +154,43 @@ case class HoodieFileIndex( override def rootPaths: Seq[Path] = queryPath :: Nil + private def createFilterFiles(dataFilters: Seq[Expression]): Set[String] = { + var allFiles: Set[String] = Set.empty + var candidateFiles: Set[String] = Set.empty + val indexPath = metaClient.getZindexPath + val fs = metaClient.getFs + if (fs.exists(new Path(indexPath)) && dataFilters.nonEmpty) { + // try to load latest index table from index path + val candidateIndexTables = fs.listStatus(new Path(indexPath)).filter(_.isDirectory) + .map(_.getPath.getName).filter(f => completeCommits.contains(f)).sortBy(x => x) + if (candidateIndexTables.nonEmpty) { + val dataFrameOpt = try { + Some(spark.read.load(new Path(indexPath, candidateIndexTables.last).toString)) + } catch { + case _: Throwable => + logError("missing index skip data-skipping") + None + } + + if (dataFrameOpt.isDefined) { + val indexSchema = dataFrameOpt.get.schema + val indexFiles = Zoptimize.getIndexFiles(spark.sparkContext.hadoopConfiguration, indexPath) + val indexFilter = dataFilters.map(Zoptimize.createZindexFilter(_, indexSchema)).reduce(And) + logInfo(s"index filter condition: ${indexFilter}") + dataFrameOpt.get.persist() + if (indexFiles.size <= 4) { + allFiles = Zoptimize.readParquetFile(spark, indexFiles) + } else { + allFiles = dataFrameOpt.get.select("file").collect().map(_.getString(0)).toSet + } + candidateFiles = dataFrameOpt.get.filter(new Column(indexFilter)).select("file").collect().map(_.getString(0)).toSet + dataFrameOpt.get.unpersist() + } + } + } + allFiles -- candidateFiles + } + /** * Invoked by Spark to fetch list of latest base files per partition. 
* @@ -156,12 +200,25 @@ case class HoodieFileIndex( */ override def listFiles(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionDirectory] = { + // try to load filterFiles from index + val filterFiles: Set[String] = createFilterFiles(dataFilters) if (queryAsNonePartitionedTable) { // Read as Non-Partitioned table. - Seq(PartitionDirectory(InternalRow.empty, allFiles)) + val candidateFiles = if (!filterFiles.isEmpty) { + allFiles.filterNot(fileStatus => filterFiles.contains(fileStatus.getPath.getName)) + } else { + allFiles + } + logInfo(s"Total file size is: ${allFiles.size}," + + s" after file skip size is: ${candidateFiles.size} " + + s"skipping percent ${if (allFiles.length != 0) (allFiles.size - candidateFiles.size) / allFiles.size.toDouble else 0}") + Seq(PartitionDirectory(InternalRow.empty, candidateFiles)) } else { // Prune the partition path by the partition filters val prunedPartitions = prunePartition(cachedAllInputFileSlices.keys.toSeq, partitionFilters) - prunedPartitions.map { partition => + var totalFileSize = 0 + var candidateFileSize = 0 + + val result = prunedPartitions.map { partition => val baseFileStatuses = cachedAllInputFileSlices(partition).map(fileSlice => { if (fileSlice.getBaseFile.isPresent) { fileSlice.getBaseFile.get().getFileStatus @@ -169,9 +226,19 @@ case class HoodieFileIndex( null } }).filterNot(_ == null) - - PartitionDirectory(partition.values, baseFileStatuses) + val candidateFiles = if (!filterFiles.isEmpty) { + baseFileStatuses.filterNot(fileStatu => filterFiles.contains(fileStatu.getPath.getName)) + } else { + baseFileStatuses + } + totalFileSize += baseFileStatuses.size + candidateFileSize += candidateFiles.size + PartitionDirectory(partition.values, candidateFiles) } + logInfo(s"Total file size is: ${totalFileSize}," + + s" after file skip size is: ${candidateFileSize} " + + s"skipping percent ${if (allFiles.length != 0) (totalFileSize - candidateFileSize) / totalFileSize.toDouble else 0}") + 
result } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestOptimizeTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestOptimizeTable.scala new file mode 100644 index 0000000000000..583d0f7788d09 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestOptimizeTable.scala @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.functional + +import org.apache.hudi.config.{HoodieClusteringConfig, HoodieWriteConfig} +import org.apache.hudi.DataSourceWriteOptions +import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings +import org.apache.hudi.testutils.HoodieClientTestBase +import org.apache.spark.sql._ +import org.junit.jupiter.api.Assertions.assertEquals +import org.junit.jupiter.api.{AfterEach, BeforeEach} +import org.junit.jupiter.params.ParameterizedTest +import org.junit.jupiter.params.provider.ValueSource + +import scala.collection.JavaConversions._ + +class TestOptimizeTable extends HoodieClientTestBase { + var spark: SparkSession = null + + val commonOpts = Map( + "hoodie.insert.shuffle.parallelism" -> "4", + "hoodie.upsert.shuffle.parallelism" -> "4", + "hoodie.bulkinsert.shuffle.parallelism" -> "4", + DataSourceWriteOptions.RECORDKEY_FIELD.key() -> "_row_key", + DataSourceWriteOptions.PARTITIONPATH_FIELD.key() -> "partition", + DataSourceWriteOptions.PRECOMBINE_FIELD.key() -> "timestamp", + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test" + ) + + @BeforeEach override def setUp() { + initPath() + initSparkContexts() + spark = sqlContext.sparkSession + initTestDataGenerator() + initFileSystem() + } + + @AfterEach override def tearDown() = { + cleanupSparkContexts() + cleanupTestDataGenerator() + cleanupFileSystem() + } + + @ParameterizedTest + @ValueSource(strings = Array("COPY_ON_WRITE", "MERGE_ON_READ")) + def testOptimizewithClustering(tableType: String): Unit = { + // Bulk Insert Operation + val records1 = recordsToStrings(dataGen.generateInserts("001", 1000)).toList + val inputDF1: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records1, 2)) + inputDF1.write.format("org.apache.hudi") + .options(commonOpts) + .option("hoodie.compact.inline", "false") + .option(DataSourceWriteOptions.OPERATION.key(), DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL) + .option(DataSourceWriteOptions.TABLE_TYPE.key(), tableType) 
+ // option for clustering + .option("hoodie.parquet.small.file.limit", "0") + .option("hoodie.clustering.inline", "true") + .option("hoodie.clustering.inline.max.commits", "1") + .option("hoodie.clustering.plan.strategy.target.file.max.bytes", "1073741824") + .option("hoodie.clustering.plan.strategy.small.file.limit", "629145600") + .option("hoodie.clustering.plan.strategy.max.bytes.per.group", Long.MaxValue.toString) + .option("hoodie.clustering.plan.strategy.target.file.max.bytes", String.valueOf(64 *1024 * 1024L)) + .option(HoodieClusteringConfig.SPACE_FILLING_CURVE_DATA_OPTIMIZE_ENABLE.key, "true") + .option(HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS.key, "begin_lat, begin_lon") + .mode(SaveMode.Overwrite) + .save(basePath) + + assertEquals(1000, spark.read.format("hudi").load(basePath).count()) + } +} From 9528a6c908cda69bb7aeca6413dd556d3e260ae6 Mon Sep 17 00:00:00 2001 From: Vinoth Chandar Date: Thu, 28 Oct 2021 05:11:19 -0700 Subject: [PATCH 2/3] Renaming some configs for consistency/simplicity. 
--- .../hudi/config/HoodieClusteringConfig.java | 48 +- .../apache/hudi/config/HoodieWriteConfig.java | 20 +- .../apache/hudi/optimize/UnsafeAccess.java | 74 -- .../apache/hudi/optimize/ZOrderingUtil.java | 62 +- .../hudi/client/SparkRDDWriteClient.java | 4 +- .../SparkSortAndSizeExecutionStrategy.java | 2 +- ...atialCurveOptimizationSortPartitioner.java | 8 +- .../table/HoodieSparkCopyOnWriteTable.java | 6 +- .../apache/spark/ZCurveOptimizeHelper.java | 355 ++++++++ .../org/apache/hudi/HoodieSparkUtils.scala | 41 + .../org/apache/spark/sql/Zoptimize.scala | 830 ------------------ .../sql/hudi/execution/RangeSample.scala | 289 +++++- .../model/HoodieColumnRangeMetadata.java | 99 +++ .../apache/hudi/common/util/ParquetUtils.java | 58 ++ .../org/apache/hudi/DataSourceOptions.scala | 5 + .../org/apache/hudi/HoodieFileIndex.scala | 29 +- .../spark/sql/hudi/DataSkippingUtils.scala | 208 +++++ .../org/apache/hudi/TestOptimizeTable.scala | 69 +- 18 files changed, 1192 insertions(+), 1015 deletions(-) delete mode 100644 hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/UnsafeAccess.java create mode 100644 hudi-client/hudi-spark-client/src/main/java/org/apache/spark/ZCurveOptimizeHelper.java delete mode 100644 hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/Zoptimize.scala create mode 100644 hudi-common/src/main/java/org/apache/hudi/common/model/HoodieColumnRangeMetadata.java create mode 100644 hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java index 684ff1459a2ad..5fcd9dfd60be4 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java 
@@ -43,7 +43,7 @@ public class HoodieClusteringConfig extends HoodieConfig { public static final String CLUSTERING_STRATEGY_PARAM_PREFIX = "hoodie.clustering.plan.strategy."; // Any Space-filling curves optimize(z-order/hilbert) params can be saved with this prefix - public static final String DATA_OPTIMIZE_PARAM_PREFIX = "hoodie.layout.optimize."; + public static final String LAYOUT_OPTIMIZE_PARAM_PREFIX = "hoodie.layout.optimize."; public static final ConfigProperty DAYBASED_LOOKBACK_PARTITIONS = ConfigProperty .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "daybased.lookback.partitions") @@ -142,17 +142,18 @@ public class HoodieClusteringConfig extends HoodieConfig { .sinceVersion("0.9.0") .withDocumentation("When rewriting data, preserves existing hoodie_commit_time"); - public static final ConfigProperty SPACE_FILLING_CURVE_DATA_OPTIMIZE_ENABLE = ConfigProperty - .key(DATA_OPTIMIZE_PARAM_PREFIX + "space.filling.curve.data.optimize.enable") + public static final ConfigProperty LAYOUT_OPTIMIZE_ENABLE = ConfigProperty + .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "enable") .defaultValue(false) .sinceVersion("0.10.0") - .withDocumentation("config to use z-ordering/space-filling curves to optimize the layout of table to boost query performance"); + .withDocumentation("Enable use z-ordering/space-filling curves to optimize the layout of table to boost query performance. 
" + + "This parameter takes precedence over clustering strategy set using " + EXECUTION_STRATEGY_CLASS_NAME.key()); - public static final ConfigProperty DATA_OPTIMIZE_STRATEGY = ConfigProperty - .key(DATA_OPTIMIZE_PARAM_PREFIX + "strategy") + public static final ConfigProperty LAYOUT_OPTIMIZE_STRATEGY = ConfigProperty + .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "strategy") .defaultValue("z-order") .sinceVersion("0.10.0") - .withDocumentation("config to provide a way to optimize data layout for table, current only support z-order and hilbert"); + .withDocumentation("Type of layout optimization to be applied, current only supports `z-order` and `hilbert` curves."); /** * There exists two method to build z-curve. @@ -162,34 +163,33 @@ public class HoodieClusteringConfig extends HoodieConfig { * Refer to rfc-28 for specific algorithm flow. * Boundary-based Interleaved Index method has better generalization, but the build speed is slower than direct method. */ - public static final ConfigProperty DATA_OPTIMIZE_BUILD_CURVE_STRATEGY = ConfigProperty - .key(DATA_OPTIMIZE_PARAM_PREFIX + "build.curve.strategy") + public static final ConfigProperty LAYOUT_OPTIMIZE_CURVE_BUILD_METHOD = ConfigProperty + .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "curve.build.method") .defaultValue("direct") .sinceVersion("0.10.0") - .withDocumentation("Config to provide whether use direct/sample method to build curve optimize for data layout," - + "build curve_optimize by directly method is faster than by sample method, however sample method produce a better data layout." - + "now support two strategies: directly,sample"); + .withDocumentation("Controls how data is sampled to build the space filling curves. two methods: `direct`,`sample`." + + "The direct method is faster than the sampling, however sample method would produce a better data layout."); /** * Doing sample for table data is the first step in Boundary-based Interleaved Index method. 
* larger sample number means better optimize result, but more memory consumption */ - public static final ConfigProperty DATA_OPTIMIZE_BUILD_CURVE_SAMPLE_NUMBER = ConfigProperty - .key(DATA_OPTIMIZE_PARAM_PREFIX + "build.curve.sample.number") + public static final ConfigProperty LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE = ConfigProperty + .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "build.curve.sample.size") .defaultValue("200000") .sinceVersion("0.10.0") - .withDocumentation("when set" + DATA_OPTIMIZE_BUILD_CURVE_STRATEGY.key() + " to sample method, sample number need to be set for it." - + " larger number means better layout result, but more memory consumer"); + .withDocumentation("when setting" + LAYOUT_OPTIMIZE_CURVE_BUILD_METHOD.key() + " to `sample`, the amount of sampling to be done." + + "Large sample size leads to better results, at the expense of more memory usage."); /** * The best way to use Z-order/Space-filling curves is to cooperate with Data-Skipping * with data-skipping query engine can greatly reduce the number of table files to be read. 
* otherwise query engine can only do row-group skipping for files (parquet/orc) */ - public static final ConfigProperty DATA_OPTIMIZE_DATA_SKIPPING_ENABLE = ConfigProperty - .key(DATA_OPTIMIZE_PARAM_PREFIX + "data.skipping.enable") + public static final ConfigProperty LAYOUT_OPTIMIZE_DATA_SKIPPING_ENABLE = ConfigProperty + .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "data.skipping.enable") .defaultValue(true) .sinceVersion("0.10.0") - .withDocumentation("enable dataSkipping for hudi, when optimize finished, statistics will be collected which used for dataSkipping"); + .withDocumentation("Enable data skipping by collecting statistics once layout optimization is complete."); /** * @deprecated Use {@link #PLAN_STRATEGY_CLASS_NAME} and its methods instead @@ -405,27 +405,27 @@ public Builder withPreserveHoodieCommitMetadata(Boolean preserveHoodieCommitMeta } public Builder withSpaceFillingCurveDataOptimizeEnable(Boolean enable) { - clusteringConfig.setValue(SPACE_FILLING_CURVE_DATA_OPTIMIZE_ENABLE, String.valueOf(enable)); + clusteringConfig.setValue(LAYOUT_OPTIMIZE_ENABLE, String.valueOf(enable)); return this; } public Builder withDataOptimizeStrategy(String strategy) { - clusteringConfig.setValue(DATA_OPTIMIZE_STRATEGY, strategy); + clusteringConfig.setValue(LAYOUT_OPTIMIZE_STRATEGY, strategy); return this; } public Builder withDataOptimizeBuildCurveStrategy(String method) { - clusteringConfig.setValue(DATA_OPTIMIZE_BUILD_CURVE_STRATEGY, method); + clusteringConfig.setValue(LAYOUT_OPTIMIZE_CURVE_BUILD_METHOD, method); return this; } public Builder withDataOptimizeBuildCurveSampleNumber(int sampleNumber) { - clusteringConfig.setValue(DATA_OPTIMIZE_BUILD_CURVE_SAMPLE_NUMBER, String.valueOf(sampleNumber)); + clusteringConfig.setValue(LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE, String.valueOf(sampleNumber)); return this; } public Builder withDataOptimizeDataSkippingEnable(boolean dataSkipping) { - clusteringConfig.setValue(DATA_OPTIMIZE_DATA_SKIPPING_ENABLE, 
String.valueOf(dataSkipping)); + clusteringConfig.setValue(LAYOUT_OPTIMIZE_DATA_SKIPPING_ENABLE, String.valueOf(dataSkipping)); return this; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java index bcf6d09390916..aeb77db187dfe 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java @@ -1231,25 +1231,25 @@ public String getClusteringSortColumns() { /** * Data layout optimize properties. */ - public boolean getSpaceFillingCurveDataOptimizeEnable() { - return getBoolean(HoodieClusteringConfig.SPACE_FILLING_CURVE_DATA_OPTIMIZE_ENABLE); + public boolean isLayoutOptimizationEnabled() { + return getBoolean(HoodieClusteringConfig.LAYOUT_OPTIMIZE_ENABLE); } - public String getDataOptimizeStrategy() { - return getString(HoodieClusteringConfig.DATA_OPTIMIZE_STRATEGY); + public String getLayoutOptimizationStrategy() { + return getString(HoodieClusteringConfig.LAYOUT_OPTIMIZE_STRATEGY); } - public HoodieClusteringConfig.BuildCurveStrategyType getOptimizeBuildCurveMethod() { + public HoodieClusteringConfig.BuildCurveStrategyType getLayoutOptimizationCurveBuildMethod() { return HoodieClusteringConfig.BuildCurveStrategyType.fromValue( - getString(HoodieClusteringConfig.DATA_OPTIMIZE_BUILD_CURVE_STRATEGY)); + getString(HoodieClusteringConfig.LAYOUT_OPTIMIZE_CURVE_BUILD_METHOD)); } - public int getOptimizeSampleNumber() { - return getInt(HoodieClusteringConfig.DATA_OPTIMIZE_BUILD_CURVE_SAMPLE_NUMBER); + public int getLayoutOptimizationSampleSize() { + return getInt(HoodieClusteringConfig.LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE); } - public boolean getOptimizeEnableDataSkipping() { - return getBoolean(HoodieClusteringConfig.DATA_OPTIMIZE_DATA_SKIPPING_ENABLE); + public boolean 
isDataSkippingEnabled() { + return getBoolean(HoodieClusteringConfig.LAYOUT_OPTIMIZE_DATA_SKIPPING_ENABLE); } /** diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/UnsafeAccess.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/UnsafeAccess.java deleted file mode 100644 index f8420e8374f8d..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/UnsafeAccess.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.optimize; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import sun.misc.Unsafe; - -import java.lang.reflect.Field; -import java.nio.ByteOrder; -import java.security.AccessController; -import java.security.PrivilegedAction; - -/** - * This class is copied from hbase used by Lexicographically comparison algorithm. - * we use Lexicographically comparison algorithm to do sort for z-values which needed by z-order. 
- * and the unsafe comparision algorithm is a faster implementation than java implementation - */ -public class UnsafeAccess { - private static final Logger LOG = LoggerFactory.getLogger(UnsafeAccess.class); - public static final Unsafe THEUNSAFE; - - /** The offset to the first element in a byte array. */ - public static final long BYTE_ARRAY_BASE_OFFSET; - - public static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() - .equals(ByteOrder.LITTLE_ENDIAN); - - // This number limits the number of bytes to copy per call to Unsafe's - // copyMemory method. A limit is imposed to allow for safepoint polling - // during a large copy - static final long UNSAFE_COPY_THRESHOLD = 1024L * 1024L; - static { - THEUNSAFE = (Unsafe) AccessController.doPrivileged(new PrivilegedAction() { - @Override - public Object run() { - try { - Field f = Unsafe.class.getDeclaredField("theUnsafe"); - f.setAccessible(true); - return f.get(null); - } catch (Throwable e) { - LOG.warn("sun.misc.Unsafe is not accessible", e); - } - return null; - } - }); - - if (THEUNSAFE != null) { - BYTE_ARRAY_BASE_OFFSET = THEUNSAFE.arrayBaseOffset(byte[].class); - } else { - BYTE_ARRAY_BASE_OFFSET = -1; - } - } - - private UnsafeAccess() { - - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/ZOrderingUtil.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/ZOrderingUtil.java index 8d4217c4be5c7..3aa808075d330 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/ZOrderingUtil.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/ZOrderingUtil.java @@ -18,24 +18,10 @@ package org.apache.hudi.optimize; -import sun.misc.Unsafe; - import java.nio.charset.Charset; public class ZOrderingUtil { - static final Unsafe THEUNSAFE; - public static final int SIZEOF_LONG = Long.SIZE / Byte.SIZE; - - static { - THEUNSAFE = UnsafeAccess.THEUNSAFE; - - // sanity check - this should never fail - if 
(THEUNSAFE.arrayIndexScale(byte[].class) != 1) { - throw new AssertionError(); - } - } - /** * Lexicographically compare two arrays. * copy from hbase @@ -50,45 +36,17 @@ public class ZOrderingUtil { public static int compareTo(byte[] buffer1, int offset1, int length1, byte[] buffer2, int offset2, int length2) { // Short circuit equal case - if (buffer1 == buffer2 && offset1 == offset2 && length1 == length2) { + if (buffer1 == buffer2 + && offset1 == offset2 + && length1 == length2) { return 0; } - final int stride = 8; - final int minLength = Math.min(length1, length2); - int strideLimit = minLength & ~(stride - 1); - final long offset1Adj = offset1 + UnsafeAccess.BYTE_ARRAY_BASE_OFFSET; - final long offset2Adj = offset2 + UnsafeAccess.BYTE_ARRAY_BASE_OFFSET; - int i; - - /* - * Compare 8 bytes at a time. Benchmarking on x86 shows a stride of 8 bytes is no slower - * than 4 bytes even on 32-bit. On the other hand, it is substantially faster on 64-bit. - */ - for (i = 0; i < strideLimit; i += stride) { - long lw = THEUNSAFE.getLong(buffer1, offset1Adj + i); - long rw = THEUNSAFE.getLong(buffer2, offset2Adj + i); - if (lw != rw) { - if (!UnsafeAccess.LITTLE_ENDIAN) { - return ((lw + Long.MIN_VALUE) < (rw + Long.MIN_VALUE)) ? -1 : 1; - } - - /* - * We want to compare only the first index where left[index] != right[index]. This - * corresponds to the least significant nonzero byte in lw ^ rw, since lw and rw are - * little-endian. Long.numberOfTrailingZeros(diff) tells us the least significant - * nonzero bit, and zeroing out the first three bits of L.nTZ gives us the shift to get - * that least significant nonzero byte. This comparison logic is based on UnsignedBytes - * comparator from guava v21 - */ - int n = Long.numberOfTrailingZeros(lw ^ rw) & ~0x7; - return ((int) ((lw >>> n) & 0xFF)) - ((int) ((rw >>> n) & 0xFF)); - } - } - - // The epilogue to cover the last (minLength % stride) elements. 
- for (; i < minLength; i++) { - int a = (buffer1[offset1 + i] & 0xFF); - int b = (buffer2[offset2 + i] & 0xFF); + // Bring WritableComparator code local + int end1 = offset1 + length1; + int end2 = offset2 + length2; + for (int i = offset1, j = offset2; i < end1 && j < end2; i++, j++) { + int a = (buffer1[i] & 0xff); + int b = (buffer2[j] & 0xff); if (a != b) { return a - b; } @@ -96,7 +54,7 @@ public static int compareTo(byte[] buffer1, int offset1, int length1, return length1 - length2; } - private static byte[] paddingTo8Byte(byte[] a) { + public static byte[] paddingTo8Byte(byte[] a) { if (a.length == 8) { return a; } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java index 0dedec6486f1f..173276d984df0 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java @@ -381,9 +381,7 @@ private void completeClustering(HoodieReplaceCommitMetadata metadata, JavaRDD performClusteringWithRecordsRDD(final JavaRDD> getPartitioner(Map strategyParams, Schema schema) { - if (getWriteConfig().getSpaceFillingCurveDataOptimizeEnable()) { + if (getWriteConfig().isLayoutOptimizationEnabled()) { // sort input records by z-order/hilbert return Option.of(new RDDSpatialCurveOptimizationSortPartitioner((HoodieSparkEngineContext) getEngineContext(), getWriteConfig(), HoodieAvroUtils.addMetadataFields(schema))); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveOptimizationSortPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveOptimizationSortPartitioner.java index ddea51bd006d6..fa12159eeac62 100644 --- 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveOptimizationSortPartitioner.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveOptimizationSortPartitioner.java @@ -33,10 +33,10 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; +import org.apache.spark.ZCurveOptimizeHelper; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.Zoptimize$; /** * A partitioner that does spartial curve optimization sorting based on specified column values for each RDD partition. @@ -77,12 +77,12 @@ private JavaRDD prepareGenericRecord(JavaRDD> inp Dataset originDF = AvroConversionUtils.createDataFrame(genericRecordJavaRDD.rdd(), schema.toString(), sparkEngineContext.getSqlContext().sparkSession()); Dataset zDataFrame; - switch (config.getOptimizeBuildCurveMethod()) { + switch (config.getLayoutOptimizationCurveBuildMethod()) { case DIRECT: - zDataFrame = Zoptimize$.MODULE$.createZIndexedDataFrameByMapValue(originDF, config.getClusteringSortColumns(), numOutputGroups); + zDataFrame = ZCurveOptimizeHelper.createZIndexedDataFrameByMapValue(originDF, config.getClusteringSortColumns(), numOutputGroups); break; case SAMPLE: - zDataFrame = Zoptimize$.MODULE$.createZIndexedDataFrameBySample(originDF, config.getClusteringSortColumns(), numOutputGroups); + zDataFrame = ZCurveOptimizeHelper.createZIndexedDataFrameBySample(originDF, config.getClusteringSortColumns(), numOutputGroups); break; default: throw new HoodieException("Not a valid build curve method for doWriteOperation: "); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java index e66c5614d8d77..8e4471010f9c0 100644 --- 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java @@ -73,8 +73,8 @@ import org.apache.hudi.table.action.savepoint.SavepointActionExecutor; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import org.apache.spark.ZCurveOptimizeHelper; import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.sql.Zoptimize$; import scala.collection.JavaConversions; import java.io.IOException; @@ -177,9 +177,9 @@ private void updateOptimizeOperationStatistics(HoodieEngineContext context, List return; } HoodieSparkEngineContext sparkEngineContext = (HoodieSparkEngineContext)context; - Zoptimize$.MODULE$.saveStatisticsInfo(sparkEngineContext + ZCurveOptimizeHelper.saveStatisticsInfo(sparkEngineContext .getSqlContext().sparkSession().read().load(JavaConversions.asScalaBuffer(touchFiles)), - cols, indexPath, instantTime, JavaConversions.asScalaBuffer(validateCommits)); + cols, indexPath, instantTime, validateCommits); LOG.info(String.format("save statistic info sucessfully at commitTime: %s", instantTime)); } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/spark/ZCurveOptimizeHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/spark/ZCurveOptimizeHelper.java new file mode 100644 index 0000000000000..7ba1c9465bfd0 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/spark/ZCurveOptimizeHelper.java @@ -0,0 +1,355 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark; + +import scala.collection.JavaConversions; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.HoodieSparkUtils$; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.util.BaseFileUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ParquetUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.optimize.ZOrderingUtil; +import org.apache.parquet.io.api.Binary; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.Row$; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.hudi.execution.RangeSampleSort$; +import org.apache.spark.sql.hudi.execution.ZorderingBinarySort; +import org.apache.spark.sql.types.BinaryType; +import org.apache.spark.sql.types.BinaryType$; +import org.apache.spark.sql.types.BooleanType; +import org.apache.spark.sql.types.ByteType; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DateType; +import org.apache.spark.sql.types.DecimalType; +import org.apache.spark.sql.types.DoubleType; +import org.apache.spark.sql.types.FloatType; +import org.apache.spark.sql.types.IntegerType; +import org.apache.spark.sql.types.LongType; 
+import org.apache.spark.sql.types.LongType$; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.ShortType; +import org.apache.spark.sql.types.StringType; +import org.apache.spark.sql.types.StringType$; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType$; +import org.apache.spark.sql.types.TimestampType; +import org.apache.spark.util.SerializableConfiguration; + +import java.io.IOException; +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class ZCurveOptimizeHelper { + + private static final String SPARK_JOB_DESCRIPTION = "spark.job.description"; + + /** + * Create z-order DataFrame directly + * first, map all base type data to byte[8], then create z-order DataFrame + * only support base type data. long,int,short,double,float,string,timestamp,decimal,date,byte + * this method is more effective than createZIndexDataFrameBySample + * + * @param df a spark DataFrame holds parquet files to be read. + * @param zCols z-sort cols + * @param fileNum spark partition num + * @return a dataFrame sorted by z-order. 
+ */ + public static Dataset createZIndexedDataFrameByMapValue(Dataset df, List zCols, int fileNum) { + Map columnsMap = Arrays.stream(df.schema().fields()).collect(Collectors.toMap(e -> e.name(), e -> e)); + int fieldNum = df.schema().fields().length; + List checkCols = zCols.stream().filter(f -> columnsMap.containsKey(f)).collect(Collectors.toList()); + if (zCols.size() != checkCols.size()) { + return df; + } + // only one col to sort, no need to use z-order + if (zCols.size() == 1) { + return df.repartitionByRange(fieldNum, org.apache.spark.sql.functions.col(zCols.get(0))); + } + Map fieldMap = zCols + .stream().collect(Collectors.toMap(e -> Arrays.asList(df.schema().fields()).indexOf(columnsMap.get(e)), e -> columnsMap.get(e))); + // z-sort + JavaRDD sortedRdd = df.toJavaRDD().map(row -> { + List zBytesList = fieldMap.entrySet().stream().map(entry -> { + int index = entry.getKey(); + StructField field = entry.getValue(); + DataType dataType = field.dataType(); + if (dataType instanceof LongType) { + return ZOrderingUtil.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : row.getLong(index)); + } else if (dataType instanceof DoubleType) { + return ZOrderingUtil.doubleTo8Byte(row.isNullAt(index) ? Double.MAX_VALUE : row.getDouble(index)); + } else if (dataType instanceof IntegerType) { + return ZOrderingUtil.intTo8Byte(row.isNullAt(index) ? Integer.MAX_VALUE : row.getInt(index)); + } else if (dataType instanceof FloatType) { + return ZOrderingUtil.doubleTo8Byte(row.isNullAt(index) ? Float.MAX_VALUE : row.getFloat(index)); + } else if (dataType instanceof StringType) { + return ZOrderingUtil.utf8To8Byte(row.isNullAt(index) ? "" : row.getString(index)); + } else if (dataType instanceof DateType) { + return ZOrderingUtil.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : row.getDate(index).getTime()); + } else if (dataType instanceof TimestampType) { + return ZOrderingUtil.longTo8Byte(row.isNullAt(index) ? 
Long.MAX_VALUE : row.getTimestamp(index).getTime()); + } else if (dataType instanceof ByteType) { + return ZOrderingUtil.byteTo8Byte(row.isNullAt(index) ? Byte.MAX_VALUE : row.getByte(index)); + } else if (dataType instanceof ShortType) { + return ZOrderingUtil.intTo8Byte(row.isNullAt(index) ? Short.MAX_VALUE : row.getShort(index)); + } else if (dataType instanceof DecimalType) { + return ZOrderingUtil.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : row.getDecimal(index).longValue()); + } else if (dataType instanceof BooleanType) { + boolean value = row.isNullAt(index) ? false : row.getBoolean(index); + return ZOrderingUtil.intTo8Byte(value ? 1 : 0); + } else if (dataType instanceof BinaryType) { + return ZOrderingUtil.paddingTo8Byte(row.isNullAt(index) ? new byte[] {0} : (byte[]) row.get(index)); + } + return null; + }).filter(f -> f != null).collect(Collectors.toList()); + byte[][] zBytes = new byte[zBytesList.size()][]; + for (int i = 0; i < zBytesList.size(); i++) { + zBytes[i] = zBytesList.get(i); + } + List zVaules = new ArrayList<>(); + zVaules.addAll(scala.collection.JavaConverters.bufferAsJavaListConverter(row.toSeq().toBuffer()).asJava()); + zVaules.add(ZOrderingUtil.interleaving(zBytes, 8)); + return Row$.MODULE$.apply(JavaConversions.asScalaBuffer(zVaules)); + }).sortBy(f -> new ZorderingBinarySort((byte[]) f.get(fieldNum)), true, fileNum); + + // create new StructType + List newFields = new ArrayList<>(); + newFields.addAll(Arrays.asList(df.schema().fields())); + newFields.add(new StructField("zIndex", BinaryType$.MODULE$, true, Metadata.empty())); + + // create new DataFrame + return df.sparkSession().createDataFrame(sortedRdd, StructType$.MODULE$.apply(newFields)).drop("zIndex"); + } + + public static Dataset createZIndexedDataFrameByMapValue(Dataset df, String zCols, int fileNum) { + if (zCols == null || zCols.isEmpty() || fileNum <= 0) { + return df; + } + return createZIndexedDataFrameByMapValue(df, + Arrays.stream(zCols.split(",")).map(f -> 
f.trim()).collect(Collectors.toList()), fileNum); + } + + public static Dataset createZIndexedDataFrameBySample(Dataset df, List zCols, int fileNum) { + return RangeSampleSort$.MODULE$.sortDataFrameBySample(df, JavaConversions.asScalaBuffer(zCols), fileNum); + } + + public static Dataset createZIndexedDataFrameBySample(Dataset df, String zCols, int fileNum) { + if (zCols == null || zCols.isEmpty() || fileNum <= 0) { + return df; + } + return createZIndexedDataFrameBySample(df, Arrays.stream(zCols.split(",")).map(f -> f.trim()).collect(Collectors.toList()), fileNum); + } + + /** + * Parse min/max statistics stored in parquet footers for z-sort cols. + * no support collect statistics from timeStampType, since parquet file has not collect the statistics for timeStampType. + * to do adapt for rfc-27 + * + * @param df a spark DataFrame holds parquet files to be read. + * @param cols z-sort cols + * @return a dataFrame holds all statistics info. + */ + public static Dataset getMinMaxValue(Dataset df, List cols) { + Map columnsMap = Arrays.stream(df.schema().fields()).collect(Collectors.toMap(e -> e.name(), e -> e.dataType())); + + List scanFiles = Arrays.asList(df.inputFiles()); + SparkContext sc = df.sparkSession().sparkContext(); + JavaSparkContext jsc = new JavaSparkContext(sc); + + SerializableConfiguration serializableConfiguration = new SerializableConfiguration(sc.hadoopConfiguration()); + int numParallelism = (scanFiles.size() / 3 + 1); + List> colMinMaxInfos = new ArrayList<>(); + String previousJobDescription = sc.getLocalProperty(SPARK_JOB_DESCRIPTION); + try { + String description = "Listing parquet column statistics"; + jsc.setJobDescription(description); + colMinMaxInfos = jsc.parallelize(scanFiles, numParallelism).mapPartitions(paths -> { + Configuration conf = serializableConfiguration.value(); + ParquetUtils parquetUtils = (ParquetUtils) BaseFileUtils.getInstance(HoodieFileFormat.PARQUET); + List>> results = new ArrayList<>(); + while (paths.hasNext()) { 
+ String path = paths.next(); + results.add(parquetUtils.readRangeFromParquetMetadata(conf, new Path(path), cols)); + } + return results.stream().flatMap(f -> f.stream()).iterator(); + }).collect(); + } finally { + jsc.setJobDescription(previousJobDescription); + } + + Map>> fileToStatsListMap = colMinMaxInfos.stream().collect(Collectors.groupingBy(e -> e.getFilePath())); + JavaRDD allMetaDataRDD = jsc.parallelize(fileToStatsListMap.values().stream().collect(Collectors.toList()), 1).map(f -> { + int colSize = f.size(); + if (colSize == 0) { + return null; + } else { + List rows = new ArrayList<>(); + rows.add(f.get(0).getFilePath()); + cols.stream().forEach(col -> { + HoodieColumnRangeMetadata currentColRangeMetaData = + f.stream().filter(s -> s.getColumnName().trim().equalsIgnoreCase(col)).findFirst().orElse(null); + DataType colType = columnsMap.get(col); + if (currentColRangeMetaData == null || colType == null) { + throw new HoodieException(String.format("cannot collect min/max statistics for col: %s", col)); + } + if (colType instanceof IntegerType) { + rows.add(currentColRangeMetaData.getMinValue()); + rows.add(currentColRangeMetaData.getMaxValue()); + } else if (colType instanceof DoubleType) { + rows.add(currentColRangeMetaData.getMinValue()); + rows.add(currentColRangeMetaData.getMaxValue()); + } else if (colType instanceof StringType) { + String minString = new String(((Binary)currentColRangeMetaData.getMinValue()).getBytes()); + String maxString = new String(((Binary)currentColRangeMetaData.getMaxValue()).getBytes()); + rows.add(minString); + rows.add(maxString); + } else if (colType instanceof DecimalType) { + Double minDecimal = Double.parseDouble(currentColRangeMetaData.getStringifier().stringify(Long.valueOf(currentColRangeMetaData.getMinValue().toString()))); + Double maxDecimal = Double.parseDouble(currentColRangeMetaData.getStringifier().stringify(Long.valueOf(currentColRangeMetaData.getMaxValue().toString()))); + 
rows.add(BigDecimal.valueOf(minDecimal)); + rows.add(BigDecimal.valueOf(maxDecimal)); + } else if (colType instanceof DateType) { + rows.add(java.sql.Date.valueOf(currentColRangeMetaData.getStringifier().stringify((int)currentColRangeMetaData.getMinValue()))); + rows.add(java.sql.Date.valueOf(currentColRangeMetaData.getStringifier().stringify((int)currentColRangeMetaData.getMaxValue()))); + } else if (colType instanceof LongType) { + rows.add(currentColRangeMetaData.getMinValue()); + rows.add(currentColRangeMetaData.getMaxValue()); + } else if (colType instanceof ShortType) { + rows.add(Short.parseShort(currentColRangeMetaData.getMinValue().toString())); + rows.add(Short.parseShort(currentColRangeMetaData.getMaxValue().toString())); + } else if (colType instanceof FloatType) { + rows.add(currentColRangeMetaData.getMinValue()); + rows.add(currentColRangeMetaData.getMaxValue()); + } else if (colType instanceof BinaryType) { + rows.add(((Binary)currentColRangeMetaData.getMinValue()).getBytes()); + rows.add(((Binary)currentColRangeMetaData.getMaxValue()).getBytes()); + } else if (colType instanceof BooleanType) { + rows.add(currentColRangeMetaData.getMinValue()); + rows.add(currentColRangeMetaData.getMaxValue()); + } else if (colType instanceof ByteType) { + rows.add(Byte.valueOf(currentColRangeMetaData.getMinValue().toString())); + rows.add(Byte.valueOf(currentColRangeMetaData.getMaxValue().toString())); + } else { + throw new HoodieException(String.format("Not support type: %s", colType)); + } + rows.add(currentColRangeMetaData.getNumNulls()); + }); + return Row$.MODULE$.apply(JavaConversions.asScalaBuffer(rows)); + } + }).filter(f -> f != null); + List allMetaDataSchema = new ArrayList<>(); + allMetaDataSchema.add(new StructField("file", StringType$.MODULE$, true, Metadata.empty())); + cols.forEach(col -> { + allMetaDataSchema.add(new StructField(col + "_minValue", columnsMap.get(col), true, Metadata.empty())); + allMetaDataSchema.add(new StructField(col + 
"_maxValue", columnsMap.get(col), true, Metadata.empty())); + allMetaDataSchema.add(new StructField(col + "_num_nulls", LongType$.MODULE$, true, Metadata.empty())); + }); + return df.sparkSession().createDataFrame(allMetaDataRDD, StructType$.MODULE$.apply(allMetaDataSchema)); + } + + public static Dataset getMinMaxValue(Dataset df, String cols) { + List rawCols = Arrays.asList(cols.split(",")).stream().map(f -> f.trim()).collect(Collectors.toList()); + return getMinMaxValue(df, rawCols); + } + + /** + * Update statistics info. + * this method will update old index table by full out join, + * and save the updated table into a new index table based on commitTime. + * old index table will be cleaned also. + * + * @param df a spark DataFrame holds parquet files to be read. + * @param cols z-sort cols. + * @param indexPath index store path. + * @param commitTime current operation commitTime. + * @param validateCommits all validate commits for current table. + * @return + */ + public static void saveStatisticsInfo(Dataset df, String cols, String indexPath, String commitTime, List validateCommits) { + Path savePath = new Path(indexPath, commitTime); + SparkSession spark = df.sparkSession(); + FileSystem fs = FSUtils.getFs(indexPath, spark.sparkContext().hadoopConfiguration()); + Dataset statisticsDF = ZCurveOptimizeHelper.getMinMaxValue(df, cols); + // try to find last validate index table from index path + try { + if (fs.exists(new Path(indexPath))) { + List allIndexTables = Arrays + .stream(fs.listStatus(new Path(indexPath))).filter(f -> f.isDirectory()).map(f -> f.getPath().getName()).collect(Collectors.toList()); + List candidateIndexTables = allIndexTables.stream().filter(f -> validateCommits.contains(f)).sorted().collect(Collectors.toList()); + List residualTables = allIndexTables.stream().filter(f -> !validateCommits.contains(f)).collect(Collectors.toList()); + Option latestIndexData = Option.empty(); + if (!candidateIndexTables.isEmpty()) { + latestIndexData = 
Option.of(spark.read().load(new Path(indexPath, candidateIndexTables.get(candidateIndexTables.size() - 1)).toString())); + // clean old index table, keep at most 1 index table. + candidateIndexTables.remove(candidateIndexTables.size() - 1); + candidateIndexTables.forEach(f -> { + try { + fs.delete(new Path(indexPath, f)); + } catch (IOException ie) { + throw new HoodieException(ie); + } + }); + } + + // clean residualTables + // retried cluster operations at the same instant time is also considered, + // the residual files produced by retried are cleaned up before save statistics + // save statistics info to index table which named commitTime + residualTables.forEach(f -> { + try { + fs.delete(new Path(indexPath, f)); + } catch (IOException ie) { + throw new HoodieException(ie); + } + }); + + if (latestIndexData.isPresent() && latestIndexData.get().schema().equals(statisticsDF.schema())) { + // update the statistics info + String originalTable = "indexTable_" + java.util.UUID.randomUUID().toString().replace("-", ""); + String updateTable = "updateTable_" + java.util.UUID.randomUUID().toString().replace("-", ""); + latestIndexData.get().registerTempTable(originalTable); + statisticsDF.registerTempTable(updateTable); + // update table by full out join + List columns = Arrays.asList(statisticsDF.schema().fieldNames()); + spark.sql(HoodieSparkUtils$ + .MODULE$.createMergeSql(originalTable, updateTable, JavaConversions.asScalaBuffer(columns))).repartition(1).write().save(savePath.toString()); + } + } else { + statisticsDF.repartition(1).write().mode("overwrite").save(savePath.toString()); + } + } catch (IOException e) { + throw new HoodieException(e); + } + } +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala index 172bbc4919592..ce39843275815 100644 --- 
a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala @@ -19,6 +19,7 @@ package org.apache.hudi import java.util.Properties + import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.hadoop.fs.{FileSystem, Path} @@ -35,6 +36,7 @@ import org.apache.spark.sql.avro.SchemaConverters import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, Literal} import org.apache.spark.sql.execution.datasources.{FileStatusCache, InMemoryFileIndex} +import org.apache.spark.sql.functions._ import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.{DataFrame, SparkSession} @@ -283,4 +285,43 @@ object HoodieSparkUtils extends SparkAdapterSupport { s"${tableSchema.fieldNames.mkString(",")}") AttributeReference(columnName, field.get.dataType, field.get.nullable)() } + + /** + * Create merge sql to merge leftTable and right table. + * + * @param leftTable table name. + * @param rightTable table name. + * @param cols merged cols. + * @return merge sql. + */ + def createMergeSql(leftTable: String, rightTable: String, cols: Seq[String]): String = { + var selectsql = "" + for (i <- (0 to cols.size-1)) { + selectsql = selectsql + s" if (${leftTable}.${cols(0)} is null, ${rightTable}.${cols(i)}, ${leftTable}.${cols(i)}) as ${cols(i)} ," + } + "select " + selectsql.dropRight(1) + s" from ${leftTable} full join ${rightTable} on ${leftTable}.${cols(0)} = ${rightTable}.${cols(0)}" + } + + /** + * Collect min/max statistics for candidate cols. + * support all col types. + * + * @param df dataFrame holds read files. + * @param cols candidate cols to collect statistics. 
+ * @return + */ + def getMinMaxValueSpark(df: DataFrame, cols: Seq[String]): DataFrame = { + val sqlContext = df.sparkSession.sqlContext + import sqlContext.implicits._ + + val values = cols.flatMap(c => Seq( min(col(c)).as(c + "_minValue"), max(col(c)).as(c + "_maxValue"), count(c).as(c + "_noNullCount"))) + val valueCounts = count("*").as("totalNum") + val projectValues = Seq(col("file")) ++ cols.flatMap(c => + Seq(col(c + "_minValue"), col(c + "_maxValue"), expr(s"totalNum - ${c + "_noNullCount"}").as(c + "_num_nulls"))) + + val result = df.select(input_file_name() as "file", col("*")) + .groupBy($"file") + .agg(valueCounts, values: _*).select(projectValues:_*) + result + } } diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/Zoptimize.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/Zoptimize.scala deleted file mode 100644 index e4623b443a273..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/Zoptimize.scala +++ /dev/null @@ -1,830 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql - -import java.sql.Date -import java.util.concurrent.{Executors, ThreadPoolExecutor} - -import com.google.common.util.concurrent.ThreadFactoryBuilder -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileStatus, Path} -import org.apache.hudi.config.HoodieClusteringConfig -import org.apache.parquet.hadoop.ParquetFileReader -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.catalyst.expressions.{Alias, And, Ascending, Attribute, AttributeReference, BoundReference, EqualNullSafe, EqualTo, Expression, ExtractValue, GetStructField, GreaterThan, GreaterThanOrEqual, In, IsNotNull, IsNull, LessThan, LessThanOrEqual, Literal, Not, Or, SortOrder, StartsWith, UnsafeProjection} -import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering -import org.apache.spark.sql.execution.datasources.PartitionedFile -import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat -import org.apache.spark.sql.functions._ -import org.apache.hudi.optimize.ZOrderingUtil -import org.apache.spark.sql.hudi.execution._ -import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.types._ -import org.apache.spark.sql.vectorized.ColumnarBatch -import org.apache.spark.unsafe.types.UTF8String -import org.apache.spark.util.{MutablePair, SerializableConfiguration} - -import scala.collection.JavaConverters._ -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer -import scala.concurrent.duration._ -import scala.concurrent.{ExecutionContext, Future} - -object Zoptimize { - - case class FileStats(val minVal: String, val maxVal: String, val num_nulls: Int = 0) - case class ColumnFileStats(val fileName: String, val colName: String, val minVal: String, val maxVal: String, val num_nulls: Int = 0) - - def 
createZIndexedDataFrameBySample(df: DataFrame, zCols: String, fileNum: Int): DataFrame = { - if (zCols == null || zCols.isEmpty) { - df - } else { - createZIndexedDataFrameBySample(df, zCols.split(",").map(_.trim), fileNum) - } - } - - /** - * create z-order DataFrame by sample - * first, sample origin data to get z-cols bounds, then create z-order DataFrame - * support all type data. - * this method need more resource and cost more time than createZIndexedDataFrameByMapValue - */ - def createZIndexedDataFrameBySample(df: DataFrame, zCols: Seq[String], fileNum: Int): DataFrame = { - val spark = df.sparkSession - val columnsMap = df.schema.fields.map(item => (item.name, item)).toMap - val fieldNum = df.schema.fields.length - val checkCols = zCols.filter(col => columnsMap(col) != null) - - if (zCols.isEmpty || checkCols.isEmpty) { - df - } else { - val zFields = zCols.map { col => - val newCol = columnsMap(col) - if (newCol == null) { - (-1, null) - } else { - newCol.dataType match { - case LongType | DoubleType | FloatType | StringType | IntegerType | DateType | TimestampType | ShortType | ByteType => - (df.schema.fields.indexOf(newCol), newCol) - case d: DecimalType => - (df.schema.fields.indexOf(newCol), newCol) - case _ => - (-1, null) - } - } - }.filter(_._1 != -1) - // Complex type found, use createZIndexedDataFrameByRange - if (zFields.length != zCols.length) { - return createZIndexedDataFrameByRange(df, zCols, fieldNum) - } - - val rawRdd = df.rdd - val sampleRdd = rawRdd.map { row => - val values = zFields.map { case (index, field) => - field.dataType match { - case LongType => - if (row.isNullAt(index)) Long.MaxValue else row.getLong(index) - case DoubleType => - if (row.isNullAt(index)) Long.MaxValue else java.lang.Double.doubleToLongBits(row.getDouble(index)) - case IntegerType => - if (row.isNullAt(index)) Long.MaxValue else row.getInt(index).toLong - case FloatType => - if (row.isNullAt(index)) Long.MaxValue else 
java.lang.Double.doubleToLongBits(row.getFloat(index).toDouble) - case StringType => - if (row.isNullAt(index)) "" else row.getString(index) - case DateType => - if (row.isNullAt(index)) Long.MaxValue else row.getDate(index).getTime - case TimestampType => - if (row.isNullAt(index)) Long.MaxValue else row.getTimestamp(index).getTime - case ByteType => - if (row.isNullAt(index)) Long.MaxValue else row.getByte(index).toLong - case ShortType => - if (row.isNullAt(index)) Long.MaxValue else row.getShort(index).toLong - case d: DecimalType => - if (row.isNullAt(index)) Long.MaxValue else row.getDecimal(index).longValue() - case _ => - null - } - }.filter(v => v != null).toArray - (values, null) - } - val zOrderBounds = df.sparkSession.sessionState.conf.getConfString( - HoodieClusteringConfig.DATA_OPTIMIZE_BUILD_CURVE_SAMPLE_NUMBER.key, - HoodieClusteringConfig.DATA_OPTIMIZE_BUILD_CURVE_SAMPLE_NUMBER.defaultValue.toString).toInt - val sample = new RangeSample(zOrderBounds, sampleRdd) - val rangeBounds = sample.getRangeBounds() - val sampleBounds = { - val candidateColNumber = rangeBounds.head._1.length - (0 to candidateColNumber - 1).map { i => - val colRangeBound = rangeBounds.map(x => (x._1(i), x._2)) - - if (colRangeBound.head._1.isInstanceOf[String]) { - sample.determineBound(colRangeBound.asInstanceOf[ArrayBuffer[(String, Float)]], math.min(zOrderBounds, rangeBounds.length), Ordering[String]) - } else { - sample.determineBound(colRangeBound.asInstanceOf[ArrayBuffer[(Long, Float)]], math.min(zOrderBounds, rangeBounds.length), Ordering[Long]) - } - } - } - - // expand bounds. 
- // maybe it's better to use the value of "spark.zorder.bounds.number" as maxLength, - // however this will lead to extra time costs when all zorder cols distinct count values are less then "spark.zorder.bounds.number" - val maxLength = sampleBounds.map(_.length).max - val expandSampleBoundsWithFactor = sampleBounds.map { bound => - val fillFactor = maxLength / bound.size - val newBound = new Array[Double](bound.length * fillFactor) - if (bound.isInstanceOf[Array[Long]] && fillFactor > 1) { - val longBound = bound.asInstanceOf[Array[Long]] - for (i <- 0 to bound.length - 1) { - for (j <- 0 to fillFactor - 1) { - // sample factor shoud not be too large, so it's ok to use 1 / fillfactor as slice - newBound(j + i*(fillFactor)) = longBound(i) + (j + 1) * (1 / fillFactor.toDouble) - } - } - (newBound, fillFactor) - } else { - (bound, 0) - } - } - - val boundBroadCast = spark.sparkContext.broadcast(expandSampleBoundsWithFactor) - - val indexRdd = rawRdd.mapPartitions { iter => - val expandBoundsWithFactor = boundBroadCast.value - val maxBoundNum = expandBoundsWithFactor.map(_._1.length).max - val longDecisionBound = new RawDecisionBound(Ordering[Long]) - val doubleDecisionBound = new RawDecisionBound(Ordering[Double]) - val stringDecisionBound = new RawDecisionBound(Ordering[String]) - import java.util.concurrent.ThreadLocalRandom - val threadLocalRandom = ThreadLocalRandom.current - - def getRank(rawIndex: Int, value: Long, isNull: Boolean): Int = { - val (expandBound, factor) = expandBoundsWithFactor(rawIndex) - if (isNull) { - expandBound.length + 1 - } else { - if (factor > 1) { - doubleDecisionBound.getBound(value + (threadLocalRandom.nextInt(factor) + 1)*(1 / factor.toDouble), expandBound.asInstanceOf[Array[Double]]) - } else { - longDecisionBound.getBound(value, expandBound.asInstanceOf[Array[Long]]) - } - } - } - - iter.map { row => - val values = zFields.zipWithIndex.map { case ((index, field), rawIndex) => - field.dataType match { - case LongType => - val 
isNull = row.isNullAt(index) - getRank(rawIndex, if (isNull) 0 else row.getLong(index), isNull) - case DoubleType => - val isNull = row.isNullAt(index) - getRank(rawIndex, if (isNull) 0 else java.lang.Double.doubleToLongBits(row.getDouble(index)), isNull) - case IntegerType => - val isNull = row.isNullAt(index) - getRank(rawIndex, if (isNull) 0 else row.getInt(index).toLong, isNull) - case FloatType => - val isNull = row.isNullAt(index) - getRank(rawIndex, if (isNull) 0 else java.lang.Double.doubleToLongBits(row.getFloat(index).toDouble), isNull) - case StringType => - val factor = maxBoundNum.toDouble / expandBoundsWithFactor(rawIndex)._1.length - if (row.isNullAt(index)) { - maxBoundNum + 1 - } else { - val currentRank = stringDecisionBound.getBound(row.getString(index), expandBoundsWithFactor(rawIndex)._1.asInstanceOf[Array[String]]) - if (factor > 1) { - (currentRank*factor).toInt + threadLocalRandom.nextInt(factor.toInt) - } else { - currentRank - } - } - case DateType => - val isNull = row.isNullAt(index) - getRank(rawIndex, if (isNull) 0 else row.getDate(index).getTime, isNull) - case TimestampType => - val isNull = row.isNullAt(index) - getRank(rawIndex, if (isNull) 0 else row.getTimestamp(index).getTime, isNull) - case ByteType => - val isNull = row.isNullAt(index) - getRank(rawIndex, if (isNull) 0 else row.getByte(index).toLong, isNull) - case ShortType => - val isNull = row.isNullAt(index) - getRank(rawIndex, if (isNull) 0 else row.getShort(index).toLong, isNull) - case d: DecimalType => - val isNull = row.isNullAt(index) - getRank(rawIndex, if (isNull) 0 else row.getDecimal(index).longValue(), isNull) - case _ => - -1 - } - }.filter(v => v != -1).map(ZOrderingUtil.intTo8Byte(_)).toArray - val zValues = ZOrderingUtil.interleaving(values, 8) - Row.fromSeq(row.toSeq ++ Seq(zValues)) - } - }.sortBy(x => ZorderingBinarySort(x.getAs[Array[Byte]](fieldNum)), numPartitions = fileNum) - val newDF = df.sparkSession.createDataFrame(indexRdd, StructType( - 
df.schema.fields ++ Seq( - StructField(s"zindex", - BinaryType, false)) - )) - newDF.drop("zindex") - } - } - - /** - * create z-order DataFrame by sample - * support all col types - */ - def createZIndexedDataFrameByRange(df: DataFrame, zCols: Seq[String], fileNum: Int): DataFrame = { - val spark = df.sparkSession - val internalRdd = df.queryExecution.toRdd - val schema = df.schema - val outputAttributes = df.queryExecution.analyzed.output - val sortingExpressions = outputAttributes.filter(p => zCols.contains(p.name)) - if (sortingExpressions.length == 0 || sortingExpressions.length != zCols.size) { - df - } else { - val zOrderBounds = df.sparkSession.sessionState.conf.getConfString( - HoodieClusteringConfig.DATA_OPTIMIZE_BUILD_CURVE_SAMPLE_NUMBER.key, - HoodieClusteringConfig.DATA_OPTIMIZE_BUILD_CURVE_SAMPLE_NUMBER.defaultValue.toString).toInt - - val sampleRdd = internalRdd.mapPartitionsInternal { iter => - val projection = UnsafeProjection.create(sortingExpressions, outputAttributes) - val mutablePair = new MutablePair[InternalRow, Null]() - // Internally, RangePartitioner runs a job on the RDD that samples keys to compute - // partition bounds. To get accurate samples, we need to copy the mutable keys. 
- iter.map(row => mutablePair.update(projection(row).copy(), null)) - } - - val orderings = sortingExpressions.map(SortOrder(_, Ascending)).zipWithIndex.map { case (ord, i) => - ord.copy(child = BoundReference(i, ord.dataType, ord.nullable)) - } - - val lazyGeneratedOrderings = orderings.map(ord => new LazilyGeneratedOrdering(Seq(ord))) - - val sample = new RangeSample(zOrderBounds, sampleRdd) - - val rangeBounds = sample.getRangeBounds() - - implicit val ordering1 = lazyGeneratedOrderings(0) - - val sampleBounds = sample.determineRowBounds(rangeBounds, math.min(zOrderBounds, rangeBounds.length), lazyGeneratedOrderings, sortingExpressions) - - val origin_orderings = sortingExpressions.map(SortOrder(_, Ascending)).map { ord => - ord.copy(child = BoundReference(0, ord.dataType, ord.nullable)) - } - - val origin_lazyGeneratedOrderings = origin_orderings.map(ord => new LazilyGeneratedOrdering(Seq(ord))) - - // expand bounds. - // maybe it's better to use the value of "spark.zorder.bounds.number" as maxLength, - // however this will lead to extra time costs when all zorder cols distinct count values are less then "spark.zorder.bounds.number" - val maxLength = sampleBounds.map(_.length).max - val expandSampleBoundsWithFactor = sampleBounds.map { bound => - val fillFactor = maxLength / bound.size.toDouble - (bound, fillFactor) - } - - val boundBroadCast = spark.sparkContext.broadcast(expandSampleBoundsWithFactor) - - val indexRdd = internalRdd.mapPartitionsInternal { iter => - val boundsWithFactor = boundBroadCast.value - import java.util.concurrent.ThreadLocalRandom - val threadLocalRandom = ThreadLocalRandom.current - val maxBoundNum = boundsWithFactor.map(_._1.length).max - val origin_Projections = sortingExpressions.map { se => - UnsafeProjection.create(Seq(se), outputAttributes) - } - - iter.map { unsafeRow => - val interleaveValues = origin_Projections.zip(origin_lazyGeneratedOrderings).zipWithIndex.map { case ((rowProject, lazyOrdering), index) => - val row = 
rowProject(unsafeRow) - val decisionBound = new RawDecisionBound(lazyOrdering) - if (row.isNullAt(0)) { - maxBoundNum + 1 - } else { - val (bound, factor) = boundsWithFactor(index) - if (factor > 1) { - val currentRank = decisionBound.getBound(row, bound.asInstanceOf[Array[InternalRow]]) - currentRank*factor.toInt + threadLocalRandom.nextInt(factor.toInt) - } else { - decisionBound.getBound(row, bound.asInstanceOf[Array[InternalRow]]) - } - } - }.toArray.map(ZOrderingUtil.intTo8Byte(_)) - val zValues = ZOrderingUtil.interleaving(interleaveValues, 8) - val mutablePair = new MutablePair[InternalRow, Array[Byte]]() - - mutablePair.update(unsafeRow, zValues) - } - }.sortBy(x => ZorderingBinarySort(x._2), numPartitions = fileNum).map(_._1) - spark.internalCreateDataFrame(indexRdd, schema) - } - } - - def getMinMaxValueSpark(df: DataFrame, cols: Seq[String]): DataFrame = { - val sqlContext = df.sparkSession.sqlContext - import sqlContext.implicits._ - - val values = cols.flatMap(c => Seq( min(col(c)).as(c + "_minValue"), max(col(c)).as(c + "_maxValue"), count(c).as(c + "_noNullCount"))) - val valueCounts = count("*").as("totalNum") - val projectValues = Seq(col("file")) ++ cols.flatMap(c => - Seq(col(c + "_minValue"), col(c + "_maxValue"), expr(s"totalNum - ${c + "_noNullCount"}").as(c + "_num_nulls"))) - - val result = df.select(input_file_name() as "file", col("*")) - .groupBy($"file") - .agg(valueCounts, values: _*).select(projectValues:_*) - result - } - - def getMinMaxValue(df: DataFrame, zCols: String): DataFrame = { - - val rawCols = zCols.split(",").map(_.trim) - - val columnsMap = df.schema.fields.map(item => (item.name, item)).toMap - - val cols = rawCols.filter { col => - if (columnsMap.contains(col)) { - columnsMap(col).dataType match { - case IntegerType | DoubleType | StringType | DateType | LongType | FloatType | ShortType => - true - case a: DecimalType => - true - case other => - false - } - } else { - false - } - } - - if (cols.size != rawCols.size) 
return getMinMaxValueSpark(df, rawCols) - - val inputFiles = df.inputFiles - val conf = df.sparkSession.sparkContext.hadoopConfiguration - - val startTime = System.nanoTime() - - val allMetaData: Array[ColumnFileStats] = if (inputFiles.length < 10) { - - val listParallelism = math.min(Runtime.getRuntime.availableProcessors()/2 + 1, inputFiles.length) - - val slicedInputFiles = inputFiles.grouped(listParallelism) - - val threadPool = { - val threadFactory = new ThreadFactoryBuilder().setDaemon(true).setNameFormat("columnStatics" + "-%d").build() - Executors.newFixedThreadPool(listParallelism, threadFactory).asInstanceOf[ThreadPoolExecutor] - } - - try { - implicit val executionContext = ExecutionContext.fromExecutor(threadPool) - val staticTasks = slicedInputFiles.map { paths => - Future { - paths.map(new Path(_)).flatMap { filePath => - val blocks = ParquetFileReader.readFooter(conf, filePath).getBlocks().asScala - blocks.flatMap(b => b.getColumns().asScala. - map(col => (col.getPath().toDotString(), - FileStats(col.getStatistics().minAsString(), col.getStatistics().maxAsString(), col.getStatistics.getNumNulls.toInt)))) - .groupBy(x => x._1).mapValues(v => v.map(vv => vv._2)). - mapValues(value => FileStats(value.map(_.minVal).min, value.map(_.maxVal).max, value.map(_.num_nulls).max)).toSeq. 
- map(x => ColumnFileStats(filePath.getName(), x._1, x._2.minVal, x._2.maxVal, x._2.num_nulls)) - }.filter(p => cols.contains(p.colName)) - } - } - - val futureResult = try { - val awaitPermission = null.asInstanceOf[scala.concurrent.CanAwait] - Future.sequence(staticTasks).result(Duration.Inf)(awaitPermission) - } catch { - case e: Throwable => - throw e - } - futureResult.flatMap(x => x).toArray - } finally { - threadPool.shutdown() - } - } else { - val sc = df.sparkSession.sparkContext - val serializableConfiguration = new SerializableConfiguration(conf) - val numParallelism = inputFiles.size/3 - val previousJobDescription = sc.getLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION) - try { - val description = s"Listing parquet column statistics" - sc.setJobDescription(description) - sc.parallelize(inputFiles, numParallelism).mapPartitions { paths => - val hadoopConf = serializableConfiguration.value - paths.map(new Path(_)).flatMap { filePath => - val blocks = ParquetFileReader.readFooter(hadoopConf, filePath).getBlocks().asScala - blocks.flatMap(b => b.getColumns().asScala. - map(col => (col.getPath().toDotString(), - FileStats(col.getStatistics().minAsString(), col.getStatistics().maxAsString(), col.getStatistics.getNumNulls.toInt)))) - .groupBy(x => x._1).mapValues(v => v.map(vv => vv._2)). - mapValues(value => FileStats(value.map(_.minVal).min, value.map(_.maxVal).max, value.map(_.num_nulls).max)).toSeq. 
- map(x => ColumnFileStats(filePath.getName(), x._1, x._2.minVal, x._2.maxVal, x._2.num_nulls)) - }.filter(p => cols.contains(p.colName)) - }.collect() - } finally { - sc.setJobDescription(previousJobDescription) - } - } - - val allMetaDataRDD = df.sparkSession.sparkContext.parallelize(allMetaData.groupBy(x => x.fileName).mapValues { css => - val size = css.length - if (size == 0) { - null - } else { - val rows = new ArrayBuffer[Any]() - rows.append(css.head.fileName) - cols.foreach { col => - val cs = css.find(p => p.colName.equals(col)).get - columnsMap(cs.colName).dataType match { - case IntegerType => - rows.append(cs.minVal.toInt) - rows.append(cs.maxVal.toInt) - case DoubleType => - rows.append(cs.minVal.toDouble) - rows.append(cs.maxVal.toDouble) - case StringType => - rows.append(cs.minVal) - rows.append(cs.maxVal) - case a: DecimalType => - rows.append(BigDecimal(cs.minVal)) - rows.append(BigDecimal(cs.maxVal)) - case DateType => - rows.append(Date.valueOf(cs.minVal)) - rows.append(Date.valueOf(cs.maxVal)) - case LongType => - rows.append(cs.minVal.toLong) - rows.append(cs.maxVal.toLong) - case ShortType => - rows.append(cs.minVal.toShort) - rows.append(cs.maxVal.toShort) - case FloatType => - rows.append(cs.minVal.toFloat) - rows.append(cs.maxVal.toFloat) - } - rows.append(cs.num_nulls) - } - Row.fromSeq(rows) - } - }.map(_._2).filter(x => x != null).toSeq, 1) - - val allMetaDataSchema = { - val neededFields = mutable.ListBuffer[StructField]() - neededFields.append(new StructField("file", StringType, false)) - cols.foreach { col => - neededFields.append(columnsMap(col).copy(name = col + "_minValue")) - neededFields.append(columnsMap(col).copy(name = col + "_maxValue")) - neededFields.append(new StructField( col + "_num_nulls", IntegerType, true)) - } - StructType(neededFields) - } - - val metaDF = df.sparkSession.createDataFrame(allMetaDataRDD, allMetaDataSchema) - metaDF - } - - def createZIndexedDataFrameByMapValue(df: DataFrame, zCols: String, fileNum: 
Int): DataFrame = { - if (zCols == null || zCols.isEmpty) { - df - } else { - createZIndexedDataFrameByMapValue(df, zCols.split(",").map(_.trim), fileNum) - } - } - - /** - * create z-order DataFrame directly - * first, map all base type data to byte[8], then create z-order DataFrame - * only support base type data. long,int,short,double,float,string,timestamp,decimal,date,byte - * this method is more effective than createZIndexDataFrameBySample - */ - def createZIndexedDataFrameByMapValue(df: DataFrame, zCols: Seq[String], fileNum: Int): DataFrame = { - val columnsMap = df.schema.fields.map(item => (item.name, item)).toMap - val fieldNum = df.schema.fields.length - - val checkCols = zCols.filter( col => columnsMap(col) != null) - - if (zCols.length ==0 && checkCols.size != zCols.size) { - df - } else { - val zFields = zCols.map { col => - val newCol = columnsMap(col) - (df.schema.fields.indexOf(newCol), newCol) - } - - val newRDD = df.rdd.map { row => - val values = zFields.map { case (index, field) => - field.dataType match { - case LongType => - ZOrderingUtil.longTo8Byte(if (row.isNullAt(index)) Long.MaxValue else row.getLong(index)) - case DoubleType => - ZOrderingUtil.doubleTo8Byte(if (row.isNullAt(index)) Double.MaxValue else row.getDouble(index)) - case IntegerType => - ZOrderingUtil.intTo8Byte(if (row.isNullAt(index)) Int.MaxValue else row.getInt(index)) - case FloatType => - ZOrderingUtil.doubleTo8Byte(if (row.isNullAt(index)) Float.MaxValue else row.getFloat(index).toDouble) - case StringType => - ZOrderingUtil.utf8To8Byte(if (row.isNullAt(index)) "" else row.getString(index)) - case DateType => - ZOrderingUtil.longTo8Byte(if (row.isNullAt(index)) Long.MaxValue else row.getDate(index).getTime) - case TimestampType => - ZOrderingUtil.longTo8Byte(if (row.isNullAt(index)) Long.MaxValue else row.getTimestamp(index).getTime) - case ByteType => - ZOrderingUtil.byteTo8Byte(if (row.isNullAt(index)) Byte.MaxValue else row.getByte(index)) - case ShortType => - 
ZOrderingUtil.intTo8Byte(if (row.isNullAt(index)) Short.MaxValue else row.getShort(index).toInt) - case d: DecimalType => - ZOrderingUtil.longTo8Byte(if (row.isNullAt(index)) Long.MaxValue else row.getDecimal(index).longValue()) - case _ => - null - } - }.filter(v => v != null).toArray - val zValues = ZOrderingUtil.interleaving(values, 8) - Row.fromSeq(row.toSeq ++ Seq(zValues)) - }.sortBy(x => ZorderingBinarySort(x.getAs[Array[Byte]](fieldNum)), numPartitions = fileNum) - - val newDF = df.sparkSession.createDataFrame(newRDD, StructType( - df.schema.fields ++ Seq( - StructField(s"zindex", - BinaryType, false)) - )) - newDF.drop("zindex") - } - } - - /** - * create z_index filter and push those filters to index table to filter all candidate scan files. - * @param condition origin filter from query. - * @param indexSchema schema from index table. - * @return filters for index table. - */ - def createZindexFilter(condition: Expression, indexSchema: StructType): Expression = { - def buildExpressionInternal(colName: Seq[String], statisticValue: String): Expression = { - val appendColName = UnresolvedAttribute(colName).name + statisticValue - col(appendColName).expr - } - - def reWriteCondition(colName: Seq[String], conditionExpress: Expression): Expression = { - val appendColName = UnresolvedAttribute(colName).name + "_minValue" - if (indexSchema.exists(p => p.name == appendColName)) { - conditionExpress - } else { - Literal.TrueLiteral - } - } - - val minValue = (colName: Seq[String]) => buildExpressionInternal(colName, "_minValue") - val maxValue = (colName: Seq[String]) => buildExpressionInternal(colName, "_maxValue") - val num_nulls = (colName: Seq[String]) => buildExpressionInternal(colName, "_num_nulls") - - condition match { - // query filter "colA = b" convert it to "colA_minValue <= b and colA_maxValue >= b" for index table - case EqualTo(attribute: AttributeReference, value: Literal) => - val colName = getTargetColNameParts(attribute) - 
reWriteCondition(colName, And(LessThanOrEqual(minValue(colName), value), GreaterThanOrEqual(maxValue(colName), value))) - // query filter "b = colA" convert it to "colA_minValue <= b and colA_maxValue >= b" for index table - case EqualTo(value: Literal, attribute: AttributeReference) => - val colName = getTargetColNameParts(attribute) - reWriteCondition(colName, And(LessThanOrEqual(minValue(colName), value), GreaterThanOrEqual(maxValue(colName), value))) - // query filter "colA = null" convert it to "colA_num_nulls = null" for index table - case equalNullSafe @ EqualNullSafe(_: AttributeReference, _ @ Literal(null, _)) => - val colName = getTargetColNameParts(equalNullSafe.left) - reWriteCondition(colName, EqualTo(num_nulls(colName), equalNullSafe.right)) - // query filter "colA < b" convert it to "colA_minValue < b" for index table - case LessThan(attribute: AttributeReference, value: Literal) => - val colName = getTargetColNameParts(attribute) - reWriteCondition(colName,LessThan(minValue(colName), value)) - // query filter "b < colA" convert it to "colA_maxValue > b" for index table - case LessThan(value: Literal, attribute: AttributeReference) => - val colName = getTargetColNameParts(attribute) - reWriteCondition(colName, GreaterThan(maxValue(colName), value)) - // query filter "colA > b" convert it to "colA_maxValue > b" for index table - case GreaterThan(attribute: AttributeReference, value: Literal) => - val colName = getTargetColNameParts(attribute) - reWriteCondition(colName, GreaterThan(maxValue(colName), value)) - // query filter "b > colA" convert it to "colA_minValue < b" for index table - case GreaterThan(value: Literal, attribute: AttributeReference) => - val colName = getTargetColNameParts(attribute) - reWriteCondition(colName, LessThan(minValue(colName), value)) - // query filter "colA <= b" convert it to "colA_minValue <= b" for index table - case LessThanOrEqual(attribute: AttributeReference, value: Literal) => - val colName = 
getTargetColNameParts(attribute) - reWriteCondition(colName, LessThanOrEqual(minValue(colName), value)) - // query filter "b <= colA" convert it to "colA_maxValue >= b" for index table - case LessThanOrEqual(value: Literal, attribute: AttributeReference) => - val colName = getTargetColNameParts(attribute) - reWriteCondition(colName, GreaterThanOrEqual(maxValue(colName), value)) - // query filter "colA >= b" convert it to "colA_maxValue >= b" for index table - case GreaterThanOrEqual(attribute: AttributeReference, right: Literal) => - val colName = getTargetColNameParts(attribute) - GreaterThanOrEqual(maxValue(colName), right) - // query filter "b >= colA" convert it to "colA_minValue <= b" for index table - case GreaterThanOrEqual(value: Literal, attribute: AttributeReference) => - val colName = getTargetColNameParts(attribute) - reWriteCondition(colName, LessThanOrEqual(minValue(colName), value)) - // query filter "colA is null" convert it to "colA_num_nulls > 0" for index table - case IsNull(attribute: AttributeReference) => - val colName = getTargetColNameParts(attribute) - reWriteCondition(colName, GreaterThan(num_nulls(colName), Literal(0))) - // query filter "colA is not null" convert it to "colA_num_nulls = 0" for index table - case IsNotNull(attribute: AttributeReference) => - val colName = getTargetColNameParts(attribute) - reWriteCondition(colName, EqualTo(num_nulls(colName), Literal(0))) - // query filter "colA in (a,b)" convert it to " (colA_minValue <= a and colA_maxValue >= a) or (colA_minValue <= b and colA_maxValue >= b) " for index table - case In(attribute: AttributeReference, list: Seq[Literal]) => - val colName = getTargetColNameParts(attribute) - reWriteCondition(colName, list.map { lit => - And(LessThanOrEqual(minValue(colName), lit), GreaterThanOrEqual(maxValue(colName), lit)) - }.reduce(Or)) - // query filter "colA like xxx" convert it to " (colA_minValue <= xxx and colA_maxValue >= xxx) or (colA_min start with xxx or colA_max start with 
xxx) " for index table - case StartsWith(attribute, v @ Literal(_: UTF8String, _)) => - val colName = getTargetColNameParts(attribute) - reWriteCondition(colName, Or(And(LessThanOrEqual(minValue(colName), v), GreaterThanOrEqual(maxValue(colName), v)) , - Or(StartsWith(minValue(colName), v), StartsWith(maxValue(colName), v)))) - // query filter "colA not in (a, b)" convert it to " (not( colA_minValue = a and colA_maxValue = a)) and (not( colA_minValue = b and colA_maxValue = b)) " for index table - case Not(In(attribute: AttributeReference, list: Seq[Literal])) => - val colName = getTargetColNameParts(attribute) - reWriteCondition(colName, list.map { lit => - Not(And(EqualTo(minValue(colName), lit), EqualTo(maxValue(colName), lit))) - }.reduce(And)) - // query filter "colA != b" convert it to "not ( colA_minValue = b and colA_maxValue = b )" for index table - case Not(EqualTo(attribute: AttributeReference, value: Literal)) => - val colName = getTargetColNameParts(attribute) - reWriteCondition(colName, Not(And(EqualTo(minValue(colName), value), EqualTo(maxValue(colName), value)))) - // query filter "b != colA" convert it to "not ( colA_minValue = b and colA_maxValue = b )" for index table - case Not(EqualTo(value: Literal, attribute: AttributeReference)) => - val colName = getTargetColNameParts(attribute) - reWriteCondition(colName, Not(And(EqualTo(minValue(colName), value), EqualTo(maxValue(colName), value)))) - // query filter "colA not like xxxx" convert it to "not ( colA_minValue startWith xxx and colA_maxValue startWith xxx)" for index table - case Not(StartsWith(attribute, value @ Literal(_: UTF8String, _))) => - val colName = getTargetColNameParts(attribute) - reWriteCondition(colName, Not(And(StartsWith(minValue(colName), value), StartsWith(maxValue(colName), value)))) - case or: Or => - val resLeft = createZindexFilter(or.left, indexSchema) - val resRight = createZindexFilter(or.right, indexSchema) - Or(resLeft, resRight) - - case and: And => - val resLeft = 
createZindexFilter(and.left, indexSchema) - val resRight = createZindexFilter(and.right, indexSchema) - And(resLeft, resRight) - - case expr: Expression => - Literal.TrueLiteral - } - } - - /** - * Extracts name from a resolved expression referring to a nested or non-nested column. - */ - def getTargetColNameParts(resolvedTargetCol: Expression): Seq[String] = { - resolvedTargetCol match { - case attr: Attribute => Seq(attr.name) - - case Alias(c, _) => getTargetColNameParts(c) - - case GetStructField(c, _, Some(name)) => getTargetColNameParts(c) :+ name - - case ex: ExtractValue => - throw new AnalysisException(s"convert reference to name failed, Updating nested fields is only supported for StructType: ${ex}.") - - case other => - throw new AnalysisException(s"convert reference to name failed, Found unsupported expression ${other}") - } - } - - def createDataFrameInternal( - spark: SparkSession, - catalystRows: RDD[InternalRow], - schema: StructType, - isStreaming: Boolean = false): DataFrame = { - spark.internalCreateDataFrame(catalystRows, schema, isStreaming) - } - - def getIndexFiles(conf: Configuration, indexPath: String): Seq[FileStatus] = { - val basePath = new Path(indexPath) - basePath.getFileSystem(conf) - .listStatus(basePath).filterNot(f => shouldFilterOutPathName(f.getPath.getName)) - } - - /** - * read parquet files concurrently by local. 
- * this method is mush faster than spark - */ - def readParquetFile(spark: SparkSession, indexFiles: Seq[FileStatus], filters: Seq[Filter] = Nil, schemaOpts: Option[StructType] = None): Set[String] = { - val hadoopConf = spark.sparkContext.hadoopConfiguration - val partitionedFiles = indexFiles.map(f => PartitionedFile(InternalRow.empty, f.getPath.toString, 0, f.getLen)) - - val requiredSchema = new StructType().add("file", StringType, true) - val schema = schemaOpts.getOrElse(requiredSchema) - val parquetReader = new ParquetFileFormat().buildReaderWithPartitionValues(spark - , schema , StructType(Nil), requiredSchema, filters, Map.empty, hadoopConf) - val results = new Array[Iterator[String]](partitionedFiles.size) - partitionedFiles.zipWithIndex.par.foreach { case (pf, index) => - val fileIterator = parquetReader(pf).asInstanceOf[Iterator[Any]] - val rows = fileIterator.flatMap(_ match { - case r: InternalRow => Seq(r) - case b: ColumnarBatch => b.rowIterator().asScala - }).map(r => r.getString(0)) - results(index) = rows - } - results.flatMap(f => f).toSet - } - - def shouldFilterOutPathName(pathName: String): Boolean = { - // We filter follow paths: - // 1. everything that starts with _ and ., except _common_metadata and metadata - // because Parquet needs to find those metadata files from leaf files returned by this method. - // We should refactor this logic to not mix metadata files with data files. - // 2. everything that ends with ._COPYING_, because this is a intermediate state of file. we - // should skip this file in case of double reading. - val exclude = (pathName.startsWith("") && !pathName.contains("=")) || - pathName.startsWith(".") || pathName.endsWith(".COPYING") - val include = pathName.startsWith("_common_metadata") || pathName.startsWith("_metadata") - exclude && !include - } - - /** - * update statistics info. 
- * this method will update old index table by full out join, - * and save the updated table into a new index table based on commitTime. - * old index table will be cleaned also. - */ - def saveStatisticsInfo( - df: DataFrame, - cols: String, - indexPath: String, - commitTime: String, - validateCommits: Seq[String]): Unit = { - val savePath = new Path(indexPath, commitTime) - val spark = df.sparkSession - val fs = savePath.getFileSystem(spark.sparkContext.hadoopConfiguration) - val statisticsDF = getMinMaxValue(df, cols) - // try to find last validate index table from index path - if (fs.exists(new Path(indexPath))) { - // find all the indexTable from .hoodie/.index - val allIndexTables = fs.listStatus(new Path(indexPath)).filter(_.isDirectory) - .map(_.getPath.getName) - val candidateIndexTables = allIndexTables.filter(f => validateCommits.contains(f)).sortBy(x => x).toList - val residualTables = allIndexTables.filter(f => !validateCommits.contains(f)) - - val optIndexDf = if (candidateIndexTables.isEmpty) { - None - } else { - try { - Some(spark.read.load(new Path(indexPath, candidateIndexTables.last).toString)) - } catch { - case _: Throwable => - None - } - } - // clean old index table, keep at most 1 index table - candidateIndexTables.dropRight(1).foreach(f => fs.delete(new Path(indexPath, f))) - // clean residualTables - // retried cluster operations at the same instant time is also considered, - // the residual files produced by retried are cleaned up before save statistics - // save statistics info to index table which named commitTime - residualTables.foreach(f => fs.delete(new Path(indexPath, f))) - if (optIndexDf.isDefined && optIndexDf.get.schema.equals(statisticsDF.schema)) { - val originalTable = "indexTable_" + java.util.UUID.randomUUID().toString.replace("-", "") - val updateTable = "updateTable_" + java.util.UUID.randomUUID().toString.replace("-", "") - optIndexDf.get.registerTempTable(originalTable) - statisticsDF.registerTempTable(updateTable) - 
// update table by full out join - val cols = optIndexDf.get.schema.map(_.name) - spark.sql(createSql(originalTable, updateTable, cols)).repartition(1).write.save(savePath.toString) - } else { - statisticsDF.repartition(1).write.mode("overwrite").save(savePath.toString) - } - } else { - statisticsDF.repartition(1).write.mode("overwrite").save(savePath.toString) - } - - } - - private def createSql(leftTable: String, rightTable: String, cols: Seq[String]): String = { - var selectsql = "" - for (i <- (0 to cols.size-1)) { - selectsql = selectsql + s" if (${leftTable}.${cols(0)} is null, ${rightTable}.${cols(i)}, ${leftTable}.${cols(i)}) as ${cols(i)} ," - } - "select " + selectsql.dropRight(1) + s" from ${leftTable} full join ${rightTable} on ${leftTable}.${cols(0)} = ${rightTable}.${cols(0)}" - } -} - diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/execution/RangeSample.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/execution/RangeSample.scala index 7a35da4d85156..da993b7545e53 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/execution/RangeSample.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/execution/RangeSample.scala @@ -20,9 +20,15 @@ package org.apache.spark.sql.hudi.execution import java.util +import org.apache.hudi.config.HoodieClusteringConfig import org.apache.spark.rdd.{PartitionPruningRDD, RDD} -import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, BoundReference, SortOrder, UnsafeProjection, UnsafeRow} import org.apache.hudi.optimize.ZOrderingUtil +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering +import org.apache.spark.sql.types._ +import org.apache.spark.util.MutablePair import 
org.apache.spark.util.random.SamplingUtils import scala.collection.mutable @@ -237,3 +243,284 @@ case class ZorderingBinarySort(b: Array[Byte]) extends Ordered[ZorderingBinarySo } } +object RangeSampleSort { + + /** + * create z-order DataFrame by sample + * support all col types + */ + def sortDataFrameBySampleSupportAllTypes(df: DataFrame, zCols: Seq[String], fileNum: Int): DataFrame = { + val spark = df.sparkSession + val internalRdd = df.queryExecution.toRdd + val schema = df.schema + val outputAttributes = df.queryExecution.analyzed.output + val sortingExpressions = outputAttributes.filter(p => zCols.contains(p.name)) + if (sortingExpressions.length == 0 || sortingExpressions.length != zCols.size) { + df + } else { + val zOrderBounds = df.sparkSession.sessionState.conf.getConfString( + HoodieClusteringConfig.LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE.key, + HoodieClusteringConfig.LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE.defaultValue.toString).toInt + + val sampleRdd = internalRdd.mapPartitionsInternal { iter => + val projection = UnsafeProjection.create(sortingExpressions, outputAttributes) + val mutablePair = new MutablePair[InternalRow, Null]() + // Internally, RangePartitioner runs a job on the RDD that samples keys to compute + // partition bounds. To get accurate samples, we need to copy the mutable keys. 
+ iter.map(row => mutablePair.update(projection(row).copy(), null)) + } + + val orderings = sortingExpressions.map(SortOrder(_, Ascending)).zipWithIndex.map { case (ord, i) => + ord.copy(child = BoundReference(i, ord.dataType, ord.nullable)) + } + + val lazyGeneratedOrderings = orderings.map(ord => new LazilyGeneratedOrdering(Seq(ord))) + + val sample = new RangeSample(zOrderBounds, sampleRdd) + + val rangeBounds = sample.getRangeBounds() + + implicit val ordering1 = lazyGeneratedOrderings(0) + + val sampleBounds = sample.determineRowBounds(rangeBounds, math.min(zOrderBounds, rangeBounds.length), lazyGeneratedOrderings, sortingExpressions) + + val origin_orderings = sortingExpressions.map(SortOrder(_, Ascending)).map { ord => + ord.copy(child = BoundReference(0, ord.dataType, ord.nullable)) + } + + val origin_lazyGeneratedOrderings = origin_orderings.map(ord => new LazilyGeneratedOrdering(Seq(ord))) + + // expand bounds. + // maybe it's better to use the value of "spark.zorder.bounds.number" as maxLength, + // however this will lead to extra time costs when all zorder cols distinct count values are less than "spark.zorder.bounds.number" + val maxLength = sampleBounds.map(_.length).max + val expandSampleBoundsWithFactor = sampleBounds.map { bound => + val fillFactor = maxLength / bound.size.toDouble + (bound, fillFactor) + } + + val boundBroadCast = spark.sparkContext.broadcast(expandSampleBoundsWithFactor) + + val indexRdd = internalRdd.mapPartitionsInternal { iter => + val boundsWithFactor = boundBroadCast.value + import java.util.concurrent.ThreadLocalRandom + val threadLocalRandom = ThreadLocalRandom.current + val maxBoundNum = boundsWithFactor.map(_._1.length).max + val origin_Projections = sortingExpressions.map { se => + UnsafeProjection.create(Seq(se), outputAttributes) + } + + iter.map { unsafeRow => + val interleaveValues = origin_Projections.zip(origin_lazyGeneratedOrderings).zipWithIndex.map { case ((rowProject, lazyOrdering), index) => + val row = 
rowProject(unsafeRow) + val decisionBound = new RawDecisionBound(lazyOrdering) + if (row.isNullAt(0)) { + maxBoundNum + 1 + } else { + val (bound, factor) = boundsWithFactor(index) + if (factor > 1) { + val currentRank = decisionBound.getBound(row, bound.asInstanceOf[Array[InternalRow]]) + currentRank*factor.toInt + threadLocalRandom.nextInt(factor.toInt) + } else { + decisionBound.getBound(row, bound.asInstanceOf[Array[InternalRow]]) + } + } + }.toArray.map(ZOrderingUtil.intTo8Byte(_)) + val zValues = ZOrderingUtil.interleaving(interleaveValues, 8) + val mutablePair = new MutablePair[InternalRow, Array[Byte]]() + + mutablePair.update(unsafeRow, zValues) + } + }.sortBy(x => ZorderingBinarySort(x._2), numPartitions = fileNum).map(_._1) + spark.internalCreateDataFrame(indexRdd, schema) + } + } + + /** + * create z-order DataFrame by sample + * first, sample origin data to get z-cols bounds, then create z-order DataFrame + * support all type data. + * this method need more resource and cost more time than createZIndexedDataFrameByMapValue + */ + def sortDataFrameBySample(df: DataFrame, zCols: Seq[String], fileNum: Int): DataFrame = { + val spark = df.sparkSession + val columnsMap = df.schema.fields.map(item => (item.name, item)).toMap + val fieldNum = df.schema.fields.length + val checkCols = zCols.filter(col => columnsMap(col) != null) + + if (zCols.isEmpty || checkCols.isEmpty) { + df + } else { + val zFields = zCols.map { col => + val newCol = columnsMap(col) + if (newCol == null) { + (-1, null) + } else { + newCol.dataType match { + case LongType | DoubleType | FloatType | StringType | IntegerType | DateType | TimestampType | ShortType | ByteType => + (df.schema.fields.indexOf(newCol), newCol) + case d: DecimalType => + (df.schema.fields.indexOf(newCol), newCol) + case _ => + (-1, null) + } + } + }.filter(_._1 != -1) + // Complex type found, use createZIndexedDataFrameByRange + if (zFields.length != zCols.length) { + return sortDataFrameBySampleSupportAllTypes(df, 
zCols, fileNum) + } + + val rawRdd = df.rdd + val sampleRdd = rawRdd.map { row => + val values = zFields.map { case (index, field) => + field.dataType match { + case LongType => + if (row.isNullAt(index)) Long.MaxValue else row.getLong(index) + case DoubleType => + if (row.isNullAt(index)) Long.MaxValue else java.lang.Double.doubleToLongBits(row.getDouble(index)) + case IntegerType => + if (row.isNullAt(index)) Long.MaxValue else row.getInt(index).toLong + case FloatType => + if (row.isNullAt(index)) Long.MaxValue else java.lang.Double.doubleToLongBits(row.getFloat(index).toDouble) + case StringType => + if (row.isNullAt(index)) "" else row.getString(index) + case DateType => + if (row.isNullAt(index)) Long.MaxValue else row.getDate(index).getTime + case TimestampType => + if (row.isNullAt(index)) Long.MaxValue else row.getTimestamp(index).getTime + case ByteType => + if (row.isNullAt(index)) Long.MaxValue else row.getByte(index).toLong + case ShortType => + if (row.isNullAt(index)) Long.MaxValue else row.getShort(index).toLong + case d: DecimalType => + if (row.isNullAt(index)) Long.MaxValue else row.getDecimal(index).longValue() + case _ => + null + } + }.filter(v => v != null).toArray + (values, null) + } + val zOrderBounds = df.sparkSession.sessionState.conf.getConfString( + HoodieClusteringConfig.LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE.key, + HoodieClusteringConfig.LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE.defaultValue.toString).toInt + val sample = new RangeSample(zOrderBounds, sampleRdd) + val rangeBounds = sample.getRangeBounds() + val sampleBounds = { + val candidateColNumber = rangeBounds.head._1.length + (0 to candidateColNumber - 1).map { i => + val colRangeBound = rangeBounds.map(x => (x._1(i), x._2)) + + if (colRangeBound.head._1.isInstanceOf[String]) { + sample.determineBound(colRangeBound.asInstanceOf[ArrayBuffer[(String, Float)]], math.min(zOrderBounds, rangeBounds.length), Ordering[String]) + } else { + 
sample.determineBound(colRangeBound.asInstanceOf[ArrayBuffer[(Long, Float)]], math.min(zOrderBounds, rangeBounds.length), Ordering[Long]) + } + } + } + + // expand bounds. + // maybe it's better to use the value of "spark.zorder.bounds.number" as maxLength, + // however this will lead to extra time costs when all zorder cols distinct count values are less than "spark.zorder.bounds.number" + val maxLength = sampleBounds.map(_.length).max + val expandSampleBoundsWithFactor = sampleBounds.map { bound => + val fillFactor = maxLength / bound.size + val newBound = new Array[Double](bound.length * fillFactor) + if (bound.isInstanceOf[Array[Long]] && fillFactor > 1) { + val longBound = bound.asInstanceOf[Array[Long]] + for (i <- 0 to bound.length - 1) { + for (j <- 0 to fillFactor - 1) { + // sample factor should not be too large, so it's ok to use 1 / fillFactor as slice + newBound(j + i*(fillFactor)) = longBound(i) + (j + 1) * (1 / fillFactor.toDouble) + } + } + (newBound, fillFactor) + } else { + (bound, 0) + } + } + + val boundBroadCast = spark.sparkContext.broadcast(expandSampleBoundsWithFactor) + + val indexRdd = rawRdd.mapPartitions { iter => + val expandBoundsWithFactor = boundBroadCast.value + val maxBoundNum = expandBoundsWithFactor.map(_._1.length).max + val longDecisionBound = new RawDecisionBound(Ordering[Long]) + val doubleDecisionBound = new RawDecisionBound(Ordering[Double]) + val stringDecisionBound = new RawDecisionBound(Ordering[String]) + import java.util.concurrent.ThreadLocalRandom + val threadLocalRandom = ThreadLocalRandom.current + + def getRank(rawIndex: Int, value: Long, isNull: Boolean): Int = { + val (expandBound, factor) = expandBoundsWithFactor(rawIndex) + if (isNull) { + expandBound.length + 1 + } else { + if (factor > 1) { + doubleDecisionBound.getBound(value + (threadLocalRandom.nextInt(factor) + 1)*(1 / factor.toDouble), expandBound.asInstanceOf[Array[Double]]) + } else { + longDecisionBound.getBound(value, 
expandBound.asInstanceOf[Array[Long]]) + } + } + } + + iter.map { row => + val values = zFields.zipWithIndex.map { case ((index, field), rawIndex) => + field.dataType match { + case LongType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getLong(index), isNull) + case DoubleType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else java.lang.Double.doubleToLongBits(row.getDouble(index)), isNull) + case IntegerType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getInt(index).toLong, isNull) + case FloatType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else java.lang.Double.doubleToLongBits(row.getFloat(index).toDouble), isNull) + case StringType => + val factor = maxBoundNum.toDouble / expandBoundsWithFactor(rawIndex)._1.length + if (row.isNullAt(index)) { + maxBoundNum + 1 + } else { + val currentRank = stringDecisionBound.getBound(row.getString(index), expandBoundsWithFactor(rawIndex)._1.asInstanceOf[Array[String]]) + if (factor > 1) { + (currentRank*factor).toInt + threadLocalRandom.nextInt(factor.toInt) + } else { + currentRank + } + } + case DateType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getDate(index).getTime, isNull) + case TimestampType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getTimestamp(index).getTime, isNull) + case ByteType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getByte(index).toLong, isNull) + case ShortType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getShort(index).toLong, isNull) + case d: DecimalType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getDecimal(index).longValue(), isNull) + case _ => + -1 + } + }.filter(v => v != -1).map(ZOrderingUtil.intTo8Byte(_)).toArray + val zValues = ZOrderingUtil.interleaving(values, 8) + 
Row.fromSeq(row.toSeq ++ Seq(zValues)) + } + }.sortBy(x => ZorderingBinarySort(x.getAs[Array[Byte]](fieldNum)), numPartitions = fileNum) + val newDF = df.sparkSession.createDataFrame(indexRdd, StructType( + df.schema.fields ++ Seq( + StructField(s"zindex", + BinaryType, false)) + )) + newDF.drop("zindex") + } + } +} + diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieColumnRangeMetadata.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieColumnRangeMetadata.java new file mode 100644 index 0000000000000..ca977ae53b5f9 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieColumnRangeMetadata.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import org.apache.parquet.schema.PrimitiveStringifier; + +import java.util.Objects; + +/** + * Hoodie Range metadata. 
+ */ +public class HoodieColumnRangeMetadata { + private final String filePath; + private final String columnName; + private final T minValue; + private final T maxValue; + private final long numNulls; + private final PrimitiveStringifier stringifier; + + public HoodieColumnRangeMetadata(final String filePath, final String columnName, final T minValue, final T maxValue, final long numNulls, final PrimitiveStringifier stringifier) { + this.filePath = filePath; + this.columnName = columnName; + this.minValue = minValue; + this.maxValue = maxValue; + this.numNulls = numNulls; + this.stringifier = stringifier; + } + + public String getFilePath() { + return this.filePath; + } + + public String getColumnName() { + return this.columnName; + } + + public T getMinValue() { + return this.minValue; + } + + public T getMaxValue() { + return this.maxValue; + } + + public PrimitiveStringifier getStringifier() { + return stringifier; + } + + public long getNumNulls() { + return numNulls; + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final HoodieColumnRangeMetadata that = (HoodieColumnRangeMetadata) o; + return Objects.equals(getFilePath(), that.getFilePath()) + && Objects.equals(getColumnName(), that.getColumnName()) + && Objects.equals(getMinValue(), that.getMinValue()) + && Objects.equals(getMaxValue(), that.getMaxValue()) + && Objects.equals(getNumNulls(), that.getNumNulls()); + } + + @Override + public int hashCode() { + return Objects.hash(getColumnName(), getMinValue(), getMaxValue(), getNumNulls()); + } + + @Override + public String toString() { + return "HoodieColumnRangeMetadata{" + + "filePath='" + filePath + '\'' + + ", columnName='" + columnName + '\'' + + ", minValue=" + minValue + + ", maxValue=" + maxValue + + ", numNulls=" + numNulls + '}'; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java 
b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java index ebe361025991c..c142e8a9608be 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java @@ -20,6 +20,7 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.exception.HoodieIOException; @@ -41,12 +42,14 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.function.Function; +import java.util.stream.Collectors; /** * Utility functions involving with parquet. @@ -277,4 +280,59 @@ public Boolean apply(String recordKey) { return candidateKeys.contains(recordKey); } } + + /** + * Parse min/max statistics stored in parquet footers for all columns. + */ + public Collection> readRangeFromParquetMetadata(Configuration conf, Path parquetFilePath, List cols) { + ParquetMetadata metadata = readMetadata(conf, parquetFilePath); + // collect stats from all parquet blocks + Map>> columnToStatsListMap = metadata.getBlocks().stream().flatMap(blockMetaData -> { + return blockMetaData.getColumns().stream().filter(f -> cols.contains(f.getPath().toDotString())).map(columnChunkMetaData -> + new HoodieColumnRangeMetadata<>(parquetFilePath.getName(), columnChunkMetaData.getPath().toDotString(), + columnChunkMetaData.getStatistics().genericGetMin(), + columnChunkMetaData.getStatistics().genericGetMax(), + columnChunkMetaData.getStatistics().getNumNulls(), + columnChunkMetaData.getPrimitiveType().stringifier())); + }).collect(Collectors.groupingBy(e -> e.getColumnName())); + + // we only intend to keep file level statistics. 
+ return new ArrayList<>(columnToStatsListMap.values().stream() + .map(blocks -> getColumnRangeInFile(blocks)) + .collect(Collectors.toList())); + } + + private HoodieColumnRangeMetadata getColumnRangeInFile(final List> blockRanges) { + if (blockRanges.size() == 1) { + // only one block in parquet file. we can just return that range. + return blockRanges.get(0); + } else { + // there are multiple blocks. Compute min(block_mins) and max(block_maxs) + return blockRanges.stream().reduce((b1, b2) -> combineRanges(b1, b2)).get(); + } + } + + private HoodieColumnRangeMetadata combineRanges(HoodieColumnRangeMetadata range1, + HoodieColumnRangeMetadata range2) { + final Comparable minValue; + final Comparable maxValue; + if (range1.getMinValue() != null && range2.getMinValue() != null) { + minValue = range1.getMinValue().compareTo(range2.getMinValue()) < 0 ? range1.getMinValue() : range2.getMinValue(); + } else if (range1.getMinValue() == null) { + minValue = range2.getMinValue(); + } else { + minValue = range1.getMinValue(); + } + + if (range1.getMaxValue() != null && range2.getMaxValue() != null) { + maxValue = range1.getMaxValue().compareTo(range2.getMaxValue()) < 0 ? 
range2.getMaxValue() : range1.getMaxValue(); + } else if (range1.getMaxValue() == null) { + maxValue = range2.getMaxValue(); + } else { + maxValue = range1.getMaxValue(); + } + + return new HoodieColumnRangeMetadata<>(range1.getFilePath(), + range1.getColumnName(), minValue, maxValue, range1.getNumNulls() + range2.getNumNulls(), range1.getStringifier()); + } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala index dc8e7ed464ac6..ea7c424a0bca9 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala @@ -110,6 +110,11 @@ object DataSourceReadOptions { .withDocumentation("The query instant for time travel. Without specified this option," + " we query the latest snapshot.") + val ENABLE_DATA_SKIPPING: ConfigProperty[Boolean] = ConfigProperty + .key("hoodie.enable.data.skipping") + .defaultValue(true) + .withDocumentation("enable data skipping to boost query after doing z-order optimize for current table") + /** @deprecated Use {@link QUERY_TYPE} and its methods instead */ @Deprecated val QUERY_TYPE_OPT_KEY = QUERY_TYPE.key() diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala index 90b99ba62e118..297a561b7aa34 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala @@ -28,18 +28,18 @@ import org.apache.hudi.common.table.view.{FileSystemViewStorageConfig, HoodieTab import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} import org.apache.spark.api.java.JavaSparkContext import 
org.apache.spark.internal.Logging -import org.apache.spark.sql.{SparkSession, Zoptimize, Column} +import org.apache.spark.sql.{Column, SparkSession} import org.apache.spark.sql.avro.SchemaConverters import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, BoundReference, Expression, InterpretedPredicate} import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} import org.apache.spark.sql.catalyst.{InternalRow, expressions} import org.apache.spark.sql.execution.datasources.{FileIndex, FileStatusCache, NoopCache, PartitionDirectory} -import org.apache.spark.sql.hudi.HoodieSqlUtils +import org.apache.spark.sql.hudi.{DataSkippingUtils, HoodieSqlUtils} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.unsafe.types.UTF8String - import java.util.Properties + import scala.collection.JavaConverters._ import scala.collection.JavaConversions._ import scala.collection.mutable @@ -88,8 +88,8 @@ case class HoodieFileIndex( /** * Get all completeCommits. */ - lazy val completeCommits = metaClient.getCommitsTimeline() - .filterCompletedInstants().getInstants().iterator().toList.map(_.getTimestamp) + lazy val completedCommits = metaClient.getCommitsTimeline + .filterCompletedInstants().getInstants.iterator().toList.map(_.getTimestamp) /** * Get the schema of the table. 
@@ -154,6 +154,11 @@ case class HoodieFileIndex( override def rootPaths: Seq[Path] = queryPath :: Nil + def enableDataSkipping(): Boolean = { + options.getOrElse(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), + spark.sessionState.conf.getConfString(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "false")).toBoolean + } + private def createFilterFiles(dataFilters: Seq[Expression]): Set[String] = { var allFiles: Set[String] = Set.empty var candidateFiles: Set[String] = Set.empty @@ -162,7 +167,7 @@ case class HoodieFileIndex( if (fs.exists(new Path(indexPath)) && dataFilters.nonEmpty) { // try to load latest index table from index path val candidateIndexTables = fs.listStatus(new Path(indexPath)).filter(_.isDirectory) - .map(_.getPath.getName).filter(f => completeCommits.contains(f)).sortBy(x => x) + .map(_.getPath.getName).filter(f => completedCommits.contains(f)).sortBy(x => x) if (candidateIndexTables.nonEmpty) { val dataFrameOpt = try { Some(spark.read.load(new Path(indexPath, candidateIndexTables.last).toString)) @@ -174,12 +179,12 @@ case class HoodieFileIndex( if (dataFrameOpt.isDefined) { val indexSchema = dataFrameOpt.get.schema - val indexFiles = Zoptimize.getIndexFiles(spark.sparkContext.hadoopConfiguration, indexPath) - val indexFilter = dataFilters.map(Zoptimize.createZindexFilter(_, indexSchema)).reduce(And) + val indexFiles = DataSkippingUtils.getIndexFiles(spark.sparkContext.hadoopConfiguration, new Path(indexPath, candidateIndexTables.last).toString) + val indexFilter = dataFilters.map(DataSkippingUtils.createZindexFilter(_, indexSchema)).reduce(And) logInfo(s"index filter condition: ${indexFilter}") dataFrameOpt.get.persist() if (indexFiles.size <= 4) { - allFiles = Zoptimize.readParquetFile(spark, indexFiles) + allFiles = DataSkippingUtils.readParquetFile(spark, indexFiles) } else { allFiles = dataFrameOpt.get.select("file").collect().map(_.getString(0)).toSet } @@ -201,7 +206,11 @@ case class HoodieFileIndex( override def 
listFiles(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionDirectory] = { // try to load filterFiles from index - val filterFiles: Set[String] = createFilterFiles(dataFilters) + val filterFiles: Set[String] = if (enableDataSkipping) { + createFilterFiles(dataFilters) + } else { + Set.empty + } if (queryAsNonePartitionedTable) { // Read as Non-Partitioned table. val candidateFiles = if (!filterFiles.isEmpty) { allFiles.filterNot(fileStatus => filterFiles.contains(fileStatus.getPath.getName)) diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala new file mode 100644 index 0000000000000..45a7aec142d5a --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.spark.sql.{AnalysisException, SparkSession} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.expressions.{Alias, And, Attribute, AttributeReference, EqualNullSafe, EqualTo, Expression, ExtractValue, GetStructField, GreaterThan, GreaterThanOrEqual, In, IsNotNull, IsNull, LessThan, LessThanOrEqual, Literal, Not, Or, StartsWith} +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.functions.col +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.{StringType, StructType} +import org.apache.spark.sql.vectorized.ColumnarBatch +import org.apache.spark.unsafe.types.UTF8String + +import scala.collection.JavaConverters._ + +object DataSkippingUtils { + + /** + * create z_index filter and push those filters to index table to filter all candidate scan files. + * @param condition origin filter from query. + * @param indexSchema schema from index table. + * @return filters for index table. 
+ */ + def createZindexFilter(condition: Expression, indexSchema: StructType): Expression = { + def buildExpressionInternal(colName: Seq[String], statisticValue: String): Expression = { + val appendColName = UnresolvedAttribute(colName).name + statisticValue + col(appendColName).expr + } + + def reWriteCondition(colName: Seq[String], conditionExpress: Expression): Expression = { + val appendColName = UnresolvedAttribute(colName).name + "_minValue" + if (indexSchema.exists(p => p.name == appendColName)) { + conditionExpress + } else { + Literal.TrueLiteral + } + } + + val minValue = (colName: Seq[String]) => buildExpressionInternal(colName, "_minValue") + val maxValue = (colName: Seq[String]) => buildExpressionInternal(colName, "_maxValue") + val num_nulls = (colName: Seq[String]) => buildExpressionInternal(colName, "_num_nulls") + + condition match { + // query filter "colA = b" convert it to "colA_minValue <= b and colA_maxValue >= b" for index table + case EqualTo(attribute: AttributeReference, value: Literal) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, And(LessThanOrEqual(minValue(colName), value), GreaterThanOrEqual(maxValue(colName), value))) + // query filter "b = colA" convert it to "colA_minValue <= b and colA_maxValue >= b" for index table + case EqualTo(value: Literal, attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, And(LessThanOrEqual(minValue(colName), value), GreaterThanOrEqual(maxValue(colName), value))) + // query filter "colA = null" convert it to "colA_num_nulls = null" for index table + case equalNullSafe @ EqualNullSafe(_: AttributeReference, _ @ Literal(null, _)) => + val colName = getTargetColNameParts(equalNullSafe.left) + reWriteCondition(colName, EqualTo(num_nulls(colName), equalNullSafe.right)) + // query filter "colA < b" convert it to "colA_minValue < b" for index table + case LessThan(attribute: AttributeReference, value: Literal) => + val 
colName = getTargetColNameParts(attribute) + reWriteCondition(colName,LessThan(minValue(colName), value)) + // query filter "b < colA" convert it to "colA_maxValue > b" for index table + case LessThan(value: Literal, attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, GreaterThan(maxValue(colName), value)) + // query filter "colA > b" convert it to "colA_maxValue > b" for index table + case GreaterThan(attribute: AttributeReference, value: Literal) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, GreaterThan(maxValue(colName), value)) + // query filter "b > colA" convert it to "colA_minValue < b" for index table + case GreaterThan(value: Literal, attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, LessThan(minValue(colName), value)) + // query filter "colA <= b" convert it to "colA_minValue <= b" for index table + case LessThanOrEqual(attribute: AttributeReference, value: Literal) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, LessThanOrEqual(minValue(colName), value)) + // query filter "b <= colA" convert it to "colA_maxValue >= b" for index table + case LessThanOrEqual(value: Literal, attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, GreaterThanOrEqual(maxValue(colName), value)) + // query filter "colA >= b" convert it to "colA_maxValue >= b" for index table + case GreaterThanOrEqual(attribute: AttributeReference, right: Literal) => + val colName = getTargetColNameParts(attribute) + GreaterThanOrEqual(maxValue(colName), right) + // query filter "b >= colA" convert it to "colA_minValue <= b" for index table + case GreaterThanOrEqual(value: Literal, attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, LessThanOrEqual(minValue(colName), value)) + // query filter "colA is 
null" convert it to "colA_num_nulls > 0" for index table + case IsNull(attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, GreaterThan(num_nulls(colName), Literal(0))) + // query filter "colA is not null" convert it to "colA_num_nulls = 0" for index table + case IsNotNull(attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, EqualTo(num_nulls(colName), Literal(0))) + // query filter "colA in (a,b)" convert it to " (colA_minValue <= a and colA_maxValue >= a) or (colA_minValue <= b and colA_maxValue >= b) " for index table + case In(attribute: AttributeReference, list: Seq[Literal]) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, list.map { lit => + And(LessThanOrEqual(minValue(colName), lit), GreaterThanOrEqual(maxValue(colName), lit)) + }.reduce(Or)) + // query filter "colA like xxx" convert it to " (colA_minValue <= xxx and colA_maxValue >= xxx) or (colA_min start with xxx or colA_max start with xxx) " for index table + case StartsWith(attribute, v @ Literal(_: UTF8String, _)) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, Or(And(LessThanOrEqual(minValue(colName), v), GreaterThanOrEqual(maxValue(colName), v)) , + Or(StartsWith(minValue(colName), v), StartsWith(maxValue(colName), v)))) + // query filter "colA not in (a, b)" convert it to " (not( colA_minValue = a and colA_maxValue = a)) and (not( colA_minValue = b and colA_maxValue = b)) " for index table + case Not(In(attribute: AttributeReference, list: Seq[Literal])) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, list.map { lit => + Not(And(EqualTo(minValue(colName), lit), EqualTo(maxValue(colName), lit))) + }.reduce(And)) + // query filter "colA != b" convert it to "not ( colA_minValue = b and colA_maxValue = b )" for index table + case Not(EqualTo(attribute: AttributeReference, value: Literal)) => + 
val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, Not(And(EqualTo(minValue(colName), value), EqualTo(maxValue(colName), value)))) + // query filter "b != colA" convert it to "not ( colA_minValue = b and colA_maxValue = b )" for index table + case Not(EqualTo(value: Literal, attribute: AttributeReference)) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, Not(And(EqualTo(minValue(colName), value), EqualTo(maxValue(colName), value)))) + // query filter "colA not like xxxx" convert it to "not ( colA_minValue startWith xxx and colA_maxValue startWith xxx)" for index table + case Not(StartsWith(attribute, value @ Literal(_: UTF8String, _))) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, Not(And(StartsWith(minValue(colName), value), StartsWith(maxValue(colName), value)))) + case or: Or => + val resLeft = createZindexFilter(or.left, indexSchema) + val resRight = createZindexFilter(or.right, indexSchema) + Or(resLeft, resRight) + + case and: And => + val resLeft = createZindexFilter(and.left, indexSchema) + val resRight = createZindexFilter(and.right, indexSchema) + And(resLeft, resRight) + + case expr: Expression => + Literal.TrueLiteral + } + } + + /** + * Extracts name from a resolved expression referring to a nested or non-nested column. 
+ */ + def getTargetColNameParts(resolvedTargetCol: Expression): Seq[String] = { + resolvedTargetCol match { + case attr: Attribute => Seq(attr.name) + + case Alias(c, _) => getTargetColNameParts(c) + + case GetStructField(c, _, Some(name)) => getTargetColNameParts(c) :+ name + + case ex: ExtractValue => + throw new AnalysisException(s"convert reference to name failed, Updating nested fields is only supported for StructType: ${ex}.") + + case other => + throw new AnalysisException(s"convert reference to name failed, Found unsupported expression ${other}") + } + } + + def getIndexFiles(conf: Configuration, indexPath: String): Seq[FileStatus] = { + val basePath = new Path(indexPath) + basePath.getFileSystem(conf) + .listStatus(basePath).filterNot(f => f.getPath.getName.endsWith(".parquet")) + } + + /** + * read parquet files concurrently by local. + * this method is mush faster than spark + */ + def readParquetFile(spark: SparkSession, indexFiles: Seq[FileStatus], filters: Seq[Filter] = Nil, schemaOpts: Option[StructType] = None): Set[String] = { + val hadoopConf = spark.sparkContext.hadoopConfiguration + val partitionedFiles = indexFiles.map(f => PartitionedFile(InternalRow.empty, f.getPath.toString, 0, f.getLen)) + + val requiredSchema = new StructType().add("file", StringType, true) + val schema = schemaOpts.getOrElse(requiredSchema) + val parquetReader = new ParquetFileFormat().buildReaderWithPartitionValues(spark + , schema , StructType(Nil), requiredSchema, filters, Map.empty, hadoopConf) + val results = new Array[Iterator[String]](partitionedFiles.size) + partitionedFiles.zipWithIndex.par.foreach { case (pf, index) => + val fileIterator = parquetReader(pf).asInstanceOf[Iterator[Any]] + val rows = fileIterator.flatMap(_ match { + case r: InternalRow => Seq(r) + case b: ColumnarBatch => b.rowIterator().asScala + }).map(r => r.getString(0)) + results(index) = rows + } + results.flatMap(f => f).toSet + } +} diff --git 
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestOptimizeTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestOptimizeTable.scala index 583d0f7788d09..06ac600b0346e 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestOptimizeTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestOptimizeTable.scala @@ -18,17 +18,23 @@ package org.apache.hudi.functional +import java.sql.{Date, Timestamp} + +import org.apache.hadoop.fs.Path import org.apache.hudi.config.{HoodieClusteringConfig, HoodieWriteConfig} -import org.apache.hudi.DataSourceWriteOptions +import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions} import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings import org.apache.hudi.testutils.HoodieClientTestBase +import org.apache.spark.ZCurveOptimizeHelper import org.apache.spark.sql._ +import org.apache.spark.sql.types._ import org.junit.jupiter.api.Assertions.assertEquals -import org.junit.jupiter.api.{AfterEach, BeforeEach} +import org.junit.jupiter.api.{AfterEach, BeforeEach, Test} import org.junit.jupiter.params.ParameterizedTest import org.junit.jupiter.params.provider.ValueSource import scala.collection.JavaConversions._ +import scala.util.Random class TestOptimizeTable extends HoodieClientTestBase { var spark: SparkSession = null @@ -76,11 +82,68 @@ class TestOptimizeTable extends HoodieClientTestBase { .option("hoodie.clustering.plan.strategy.small.file.limit", "629145600") .option("hoodie.clustering.plan.strategy.max.bytes.per.group", Long.MaxValue.toString) .option("hoodie.clustering.plan.strategy.target.file.max.bytes", String.valueOf(64 *1024 * 1024L)) - .option(HoodieClusteringConfig.SPACE_FILLING_CURVE_DATA_OPTIMIZE_ENABLE.key, "true") + .option(HoodieClusteringConfig.LAYOUT_OPTIMIZE_ENABLE.key, "true") .option(HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS.key, "begin_lat, begin_lon") .mode(SaveMode.Overwrite) 
.save(basePath) assertEquals(1000, spark.read.format("hudi").load(basePath).count()) + assertEquals(1000, + spark.read.option(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "true").format("hudi").load(basePath).count()) + } + + @Test + def testCollectMinMaxStatistics(): Unit = { + val testPath = new Path(System.getProperty("java.io.tmpdir"), "minMax") + val statisticPath = new Path(System.getProperty("java.io.tmpdir"), "stat") + val fs = testPath.getFileSystem(spark.sparkContext.hadoopConfiguration) + try { + val complexDataFrame = createComplexDataFrame(spark) + complexDataFrame.repartition(3).write.mode("overwrite").save(testPath.toString) + val df = spark.read.load(testPath.toString) + // do not support TimeStampType, so if we collect statistics for c4, should throw exception + val colDf = ZCurveOptimizeHelper.getMinMaxValue(df, "c1,c2,c3,c5,c6,c7,c8") + colDf.cache() + assertEquals(colDf.count(), 3) + assertEquals(colDf.take(1)(0).length, 22) + colDf.unpersist() + // try to save statistics + ZCurveOptimizeHelper.saveStatisticsInfo(df, "c1,c2,c3,c5,c6,c7,c8", statisticPath.toString, "2", Seq("0", "1")) + // save again + ZCurveOptimizeHelper.saveStatisticsInfo(df, "c1,c2,c3,c5,c6,c7,c8", statisticPath.toString, "3", Seq("0", "1", "2")) + // test old index table clean + ZCurveOptimizeHelper.saveStatisticsInfo(df, "c1,c2,c3,c5,c6,c7,c8", statisticPath.toString, "4", Seq("0", "1", "3")) + assertEquals(!fs.exists(new Path(statisticPath, "2")), true) + assertEquals(fs.exists(new Path(statisticPath, "3")), true) + } finally { + if (fs.exists(testPath)) fs.delete(testPath) + if (fs.exists(statisticPath)) fs.delete(statisticPath) + } + } + + def createComplexDataFrame(spark: SparkSession): DataFrame = { + val schema = new StructType() + .add("c1", IntegerType) + .add("c2", StringType) + .add("c3", DecimalType(9,3)) + .add("c4", TimestampType) + .add("c5", ShortType) + .add("c6", DateType) + .add("c7", BinaryType) + .add("c8", ByteType) + + val rdd = 
spark.sparkContext.parallelize(0 to 1000, 1).map { item => + val c1 = Integer.valueOf(item) + val c2 = s" ${item}sdc" + val c3 = new java.math.BigDecimal(s"${Random.nextInt(1000)}.${item}") + val c4 = new Timestamp(System.currentTimeMillis()) + val c5 = java.lang.Short.valueOf(s"${(item + 16) /10}") + val c6 = Date.valueOf(s"${2020}-${item % 11 + 1}-${item % 28 + 1}") + val c7 = Array(item).map(_.toByte) + val c8 = java.lang.Byte.valueOf("9") + + RowFactory.create(c1, c2, c3, c4, c5, c6, c7, c8) + } + spark.createDataFrame(rdd, schema) } } From 7b975e5006aaafe4304a541bef7f9ea3ef9e4655 Mon Sep 17 00:00:00 2001 From: Vinoth Chandar Date: Tue, 2 Nov 2021 04:46:19 -0700 Subject: [PATCH 3/3] Minor code cleanups --- .../common/table/HoodieTableMetaClient.java | 2 +- .../org/apache/hudi/DataSourceOptions.scala | 1 + .../org/apache/hudi/HoodieFileIndex.scala | 20 +++++++++---------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java index 47f27b161fc83..340a99ec2e208 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java @@ -79,7 +79,7 @@ public class HoodieTableMetaClient implements Serializable { public static final String AUXILIARYFOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".aux"; public static final String BOOTSTRAP_INDEX_ROOT_FOLDER_PATH = AUXILIARYFOLDER_NAME + Path.SEPARATOR + ".bootstrap"; public static final String HEARTBEAT_FOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".heartbeat"; - public static final String ZINDEX_NAME = ".index"; + public static final String ZINDEX_NAME = ".zindex"; public static final String BOOTSTRAP_INDEX_BY_PARTITION_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH + Path.SEPARATOR + ".partitions"; public static final String 
BOOTSTRAP_INDEX_BY_FILE_ID_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH + Path.SEPARATOR diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala index ea7c424a0bca9..94bcc0d0de85e 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala @@ -113,6 +113,7 @@ object DataSourceReadOptions { val ENABLE_DATA_SKIPPING: ConfigProperty[Boolean] = ConfigProperty .key("hoodie.enable.data.skipping") .defaultValue(true) + .sinceVersion("0.10.0") .withDocumentation("enable data skipping to boost query after doing z-order optimize for current table") /** @deprecated Use {@link QUERY_TYPE} and its methods instead */ diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala index 297a561b7aa34..882636c4697be 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala @@ -159,7 +159,7 @@ case class HoodieFileIndex( spark.sessionState.conf.getConfString(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "false")).toBoolean } - private def createFilterFiles(dataFilters: Seq[Expression]): Set[String] = { + private def filterFilesByDataSkippingIndex(dataFilters: Seq[Expression]): Set[String] = { var allFiles: Set[String] = Set.empty var candidateFiles: Set[String] = Set.empty val indexPath = metaClient.getZindexPath @@ -181,7 +181,7 @@ case class HoodieFileIndex( val indexSchema = dataFrameOpt.get.schema val indexFiles = DataSkippingUtils.getIndexFiles(spark.sparkContext.hadoopConfiguration, new Path(indexPath, 
candidateIndexTables.last).toString) val indexFilter = dataFilters.map(DataSkippingUtils.createZindexFilter(_, indexSchema)).reduce(And) - logInfo(s"index filter condition: ${indexFilter}") + logInfo(s"index filter condition: $indexFilter") dataFrameOpt.get.persist() if (indexFiles.size <= 4) { allFiles = DataSkippingUtils.readParquetFile(spark, indexFiles) @@ -206,8 +206,8 @@ case class HoodieFileIndex( override def listFiles(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionDirectory] = { // try to load filterFiles from index - val filterFiles: Set[String] = if (enableDataSkipping) { - createFilterFiles(dataFilters) + val filterFiles: Set[String] = if (enableDataSkipping()) { + filterFilesByDataSkippingIndex(dataFilters) } else { Set.empty } @@ -217,9 +217,9 @@ case class HoodieFileIndex( } else { allFiles } - logInfo(s"Total file size is: ${allFiles.size}," + - s" after file skip size is: ${candidateFiles.size} " + - s"skipping percent ${if (allFiles.length != 0) (allFiles.size - candidateFiles.size) / allFiles.size.toDouble else 0}") + logInfo(s"Total files : ${allFiles.size}," + + s" candidate files after data skipping: ${candidateFiles.size} " + + s" skipping percent ${if (allFiles.length != 0) (allFiles.size - candidateFiles.size) / allFiles.size.toDouble else 0}") Seq(PartitionDirectory(InternalRow.empty, candidateFiles)) } else { // Prune the partition path by the partition filters @@ -236,7 +236,7 @@ case class HoodieFileIndex( } }).filterNot(_ == null) val candidateFiles = if (!filterFiles.isEmpty) { - baseFileStatuses.filterNot(fileStatu => filterFiles.contains(fileStatu.getPath.getName)) + baseFileStatuses.filterNot(fileStatus => filterFiles.contains(fileStatus.getPath.getName)) } else { baseFileStatuses } @@ -244,8 +244,8 @@ case class HoodieFileIndex( candidateFileSize += candidateFiles.size PartitionDirectory(partition.values, candidateFiles) } - logInfo(s"Total file size is: ${totalFileSize}," + - s" after file skip 
size is: ${candidateFileSize} " + + logInfo(s"Total files: ${totalFileSize}," + + s" Candidate files after data skipping : ${candidateFileSize} " + s"skipping percent ${if (allFiles.length != 0) (totalFileSize - candidateFileSize) / totalFileSize.toDouble else 0}") result }