apache · yihua · Nov 27, 2021 · Nov 9, 2021 · Nov 25, 2021 · Nov 25, 2021
diff --git a/NOTICE b/NOTICE
@@ -159,3 +159,9 @@ its NOTICE file:
   This product includes software developed at
   StreamSets (http://www.streamsets.com/).
 
+--------------------------------------------------------------------------------
+
+This product includes code from the hilbert-curve project
+ * Copyright https://github.com/davidmoten/hilbert-curve
+ * Licensed under the Apache-2.0 License
+
diff --git a/hudi-client/hudi-client-common/pom.xml b/hudi-client/hudi-client-common/pom.xml
@@ -64,6 +64,13 @@
       <artifactId>parquet-avro</artifactId>
     </dependency>
 
+    <!-- Hilbert Curve -->
+    <dependency>
+      <groupId>com.github.davidmoten</groupId>
+      <artifactId>hilbert-curve</artifactId>
+      <version>0.2.2</version>
+    </dependency>
+
     <!-- Dropwizard Metrics -->
     <dependency>
       <groupId>io.dropwizard.metrics</groupId>

diff --git a/...lient/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java b/...lient/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java
@@ -542,4 +542,32 @@ public static BuildCurveStrategyType fromValue(String value) {
       }
     }
   }
+
+  /**
+   * Strategy types for optimizing the layout of hudi data.
+   */
+  public enum BuildLayoutOptimizationStrategy {
+    ZORDER("z-order"),
+    HILBERT("hilbert");
+
+    // String form of the strategy, as returned by toCustomString() and accepted by fromValue().
+    private final String value;
+
+    BuildLayoutOptimizationStrategy(String value) {
+      this.value = value;
+    }
+
+    /**
+     * Returns the string form of this strategy ("z-order" or "hilbert").
+     */
+    public String toCustomString() {
+      return value;
+    }
+
+    /**
+     * Resolves a strategy from its string form; matching is case-insensitive.
+     *
+     * @param value string form of the strategy, e.g. "z-order" or "hilbert"
+     * @return the matching strategy
+     * @throws HoodieException if the value does not name a known strategy
+     */
+    public static BuildLayoutOptimizationStrategy fromValue(String value) {
+      switch (value.toLowerCase(Locale.ROOT)) {
+        case "z-order":
+          return ZORDER;
+        case "hilbert":
+          return HILBERT;
+        default:
+          // Report the rejected input and the accepted values so a misconfiguration is easy to diagnose.
+          throw new HoodieException("Invalid value of Type: '" + value + "'. Expected one of: 'z-order', 'hilbert'.");
+      }
+    }
+  }
 }
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/HilbertCurveUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/HilbertCurveUtils.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.optimize;
+
+import org.davidmoten.hilbert.HilbertCurve;
+
+import java.math.BigInteger;
+
+/**
+ * Helper methods for turning Hilbert curve indexes into fixed-length byte arrays.
+ */
+public class HilbertCurveUtils {
+
+  /**
+   * Computes the Hilbert index of {@code points} on {@code hilbertCurve} and returns the bytes of
+   * that index (as produced by {@link BigInteger#toByteArray()}) adjusted to {@code paddingNum} bytes.
+   */
+  public static byte[] indexBytes(HilbertCurve hilbertCurve, long[] points, int paddingNum) {
+    byte[] rawIndex = hilbertCurve.index(points).toByteArray();
+    return paddingToNByte(rawIndex, paddingNum);
+  }
+
+  /**
+   * Adjusts {@code a} to exactly {@code paddingNum} bytes.
+   *
+   * <p>An array that already has the requested length is returned as is (same instance). A longer
+   * array is cut down to its first {@code paddingNum} bytes; a shorter one is copied into the tail
+   * of a new array so that the leading bytes are zero.
+   */
+  public static byte[] paddingToNByte(byte[] a, int paddingNum) {
+    int missing = paddingNum - a.length;
+    if (missing == 0) {
+      return a;
+    }
+    byte[] adjusted = new byte[paddingNum];
+    if (missing < 0) {
+      // Too long: keep only the leading paddingNum bytes.
+      System.arraycopy(a, 0, adjusted, 0, paddingNum);
+    } else {
+      // Too short: a fresh array is already zero-filled, so only the tail needs to be written.
+      System.arraycopy(a, 0, adjusted, missing, a.length);
+    }
+    return adjusted;
+  }
+}
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/ZOrderingUtil.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/ZOrderingUtil.java
@@ -176,9 +176,14 @@ public static byte[] utf8To8Byte(String a) {
 
+  /**
+   * Converts a string to a long by encoding it with {@link #utf8To8Byte(String)} and decoding the
+   * resulting bytes with {@link #convertBytesToLong(byte[])}.
+   */
   public static Long convertStringToLong(String a) {
     byte[] bytes = utf8To8Byte(a);
+    return convertBytesToLong(bytes);
+  }
+
+  /**
+   * Decodes a byte array into a long, treating the bytes as big-endian (the byte at index 0 ends
+   * up in the most significant position).
+   *
+   * <p>The input is first passed through {@code paddingTo8Byte}; the loop below reads indexes 0..7
+   * of its result. NOTE(review): presumably that helper normalizes any input to exactly 8 bytes
+   * (left-padding shorter ones) - confirm against its definition.
+   */
+  public static long convertBytesToLong(byte[] bytes) {
+    byte[] paddedBytes = paddingTo8Byte(bytes);
     long temp = 0L;
     for (int i = 7; i >= 0; i--) {
-      temp = temp | (((long)bytes[i] & 0xff) << (7 - i) * 8);
+      // Mask with 0xff to drop sign extension, then shift: index 7 -> bits 0-7, index 0 -> bits 56-63.
+      temp = temp | (((long) paddedBytes[i] & 0xff) << (7 - i) * 8);
     }
     return temp;
   }

diff --git a/...ient/hudi-client-common/src/test/java/org/apache/hudi/optimize/TestHilbertCurveUtils.java b/...ient/hudi-client-common/src/test/java/org/apache/hudi/optimize/TestHilbertCurveUtils.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.optimize;
+
+import org.davidmoten.hilbert.HilbertCurve;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+/**
+ * Tests for {@link HilbertCurveUtils} and for the Hilbert curve index values it converts to bytes.
+ */
+public class TestHilbertCurveUtils {
+
+  // Curve built with bits(5) and dimensions(2); shared by all tests.
+  private static final HilbertCurve INSTANCE = HilbertCurve.bits(5).dimensions(2);
+
+  @Test
+  public void testIndex() {
+    long[] t = {1, 2};
+    assertEquals(13, INSTANCE.index(t).intValue());
+    long[] t1 = {0, 16};
+    assertEquals(256, INSTANCE.index(t1).intValue());
+  }
+
+  @Test
+  public void testIndexBytes() {
+    // Index 13 occupies a single byte, so it is left-padded with zero bytes.
+    assertBytes(HilbertCurveUtils.indexBytes(INSTANCE, new long[] {1, 2}, 2), 0, 13);
+    // Index 256 is the two bytes {1, 0}; padding to four bytes adds two leading zeros.
+    assertBytes(HilbertCurveUtils.indexBytes(INSTANCE, new long[] {0, 16}, 4), 0, 0, 1, 0);
+  }
+
+  @Test
+  public void testPaddingToNByte() {
+    // Shorter input: zero bytes are added in front.
+    assertBytes(HilbertCurveUtils.paddingToNByte(new byte[] {1, 2}, 4), 0, 0, 1, 2);
+    // Input of the requested length: content is unchanged.
+    assertBytes(HilbertCurveUtils.paddingToNByte(new byte[] {1, 2}, 2), 1, 2);
+    // Longer input: only the first bytes are kept.
+    assertBytes(HilbertCurveUtils.paddingToNByte(new byte[] {1, 2, 3}, 2), 1, 2);
+  }
+
+  /**
+   * Asserts that {@code actual} has exactly the given byte values, in order.
+   */
+  private static void assertBytes(byte[] actual, int... expected) {
+    assertEquals(expected.length, actual.length);
+    for (int i = 0; i < expected.length; i++) {
+      assertEquals(expected[i], actual[i]);
+    }
+  }
+}
diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/optimize/TestZOrderingUtil.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/optimize/TestZOrderingUtil.java
@@ -126,4 +126,29 @@ public OrginValueWrapper(T index, T originValue) {
       this.originValue = originValue;
     }
   }
+
+  /**
+   * A long split into 8 bytes by convertLongToBytes must be restored by convertBytesToLong.
+   */
+  @Test
+  public void testConvertBytesToLong() {
+    long[] tests = new long[] {Long.MIN_VALUE, -1L, 0, 1L, Long.MAX_VALUE};
+    for (long expected : tests) {
+      // assertEquals takes (expected, actual); keeping that order makes failure messages read correctly.
+      assertEquals(expected, ZOrderingUtil.convertBytesToLong(convertLongToBytes(expected)));
+    }
+  }
+
+  /**
+   * A two-byte input must decode to the value of those bytes read most significant byte first.
+   */
+  @Test
+  public void testConvertBytesToLongWithPadding() {
+    byte[] bytes = new byte[] {2, 127};
+    // assertEquals takes (expected, actual).
+    assertEquals(2L * 256 + 127, ZOrderingUtil.convertBytesToLong(bytes));
+  }
+
+  /**
+   * Splits a long into 8 bytes, most significant byte first.
+   */
+  private byte[] convertLongToBytes(long num) {
+    byte[] out = new byte[Long.BYTES];
+    long remaining = num;
+    // Fill from the last position backwards, consuming the lowest 8 bits each time.
+    for (int pos = out.length - 1; pos >= 0; pos--) {
+      out[pos] = (byte) remaining;
+      remaining >>= 8;
+    }
+    return out;
+  }
 }
diff --git a/...java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveOptimizationSortPartitioner.java b/...java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveOptimizationSortPartitioner.java
@@ -33,7 +33,7 @@
 
 import org.apache.avro.Schema;
 import org.apache.avro.generic.GenericRecord;
-import org.apache.hudi.index.zorder.ZOrderingIndexHelper;
+import org.apache.spark.OrderingIndexHelper;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
@@ -79,10 +79,12 @@ private JavaRDD<GenericRecord> prepareGenericRecord(JavaRDD<HoodieRecord<T>> inp
 
     switch (config.getLayoutOptimizationCurveBuildMethod()) {
       case DIRECT:
-        zDataFrame = ZOrderingIndexHelper.createZIndexedDataFrameByMapValue(originDF, config.getClusteringSortColumns(), numOutputGroups);
+        zDataFrame = OrderingIndexHelper
+            .createOptimizedDataFrameByMapValue(originDF, config.getClusteringSortColumns(), numOutputGroups, config.getLayoutOptimizationStrategy());
         break;
       case SAMPLE:
-        zDataFrame = ZOrderingIndexHelper.createZIndexedDataFrameBySample(originDF, config.getClusteringSortColumns(), numOutputGroups);
+        zDataFrame = OrderingIndexHelper
+            .createOptimizeDataFrameBySample(originDF, config.getClusteringSortColumns(), numOutputGroups, config.getLayoutOptimizationStrategy());
         break;
       default:
         throw new HoodieException("Not a valid build curve method for doWriteOperation: ");

diff --git a/...nt/hudi-spark-client/src/main/java/org/apache/hudi/index/zorder/ZOrderingIndexHelper.java b/...nt/hudi-spark-client/src/main/java/org/apache/hudi/index/zorder/ZOrderingIndexHelper.java
@@ -18,17 +18,19 @@
 
 package org.apache.hudi.index.zorder;
 
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieColumnRangeMetadata;
 import org.apache.hudi.common.model.HoodieFileFormat;
 import org.apache.hudi.common.util.BaseFileUtils;
 import org.apache.hudi.common.util.ParquetUtils;
 import org.apache.hudi.common.util.collection.Pair;
+import org.apache.hudi.config.HoodieClusteringConfig;
 import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.optimize.ZOrderingUtil;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 import org.apache.parquet.io.api.Binary;
@@ -62,10 +64,10 @@
 import org.apache.spark.sql.types.StructType$;
 import org.apache.spark.sql.types.TimestampType;
 import org.apache.spark.util.SerializableConfiguration;
-import scala.collection.JavaConversions;
 
 import javax.annotation.Nonnull;
 import javax.annotation.Nullable;
+
 import java.io.IOException;
 import java.math.BigDecimal;
 import java.util.ArrayList;
@@ -77,6 +79,8 @@
 import java.util.stream.Collectors;
 import java.util.stream.StreamSupport;
 
+import scala.collection.JavaConversions;
+
 import static org.apache.hudi.util.DataTypeUtils.areCompatible;
 
 public class ZOrderingIndexHelper {
@@ -189,7 +193,8 @@ public static Dataset<Row> createZIndexedDataFrameByMapValue(Dataset<Row> df, St
   }
 
+  /**
+   * Sorts {@code df} by delegating to {@code RangeSampleSort.sortDataFrameBySample}, always
+   * passing the string form of the z-order layout optimization strategy.
+   *
+   * @param df data frame handed to the sort
+   * @param zCols column names, converted to a Scala buffer before being handed to the sort
+   * @param fileNum passed through unchanged; NOTE(review): presumably the target number of
+   *                output files - confirm against RangeSampleSort
+   * @return the data frame returned by the sort
+   */
   public static Dataset<Row> createZIndexedDataFrameBySample(Dataset<Row> df, List<String> zCols, int fileNum) {
-    return RangeSampleSort$.MODULE$.sortDataFrameBySample(df, JavaConversions.asScalaBuffer(zCols), fileNum);
+    return RangeSampleSort$.MODULE$.sortDataFrameBySample(df, JavaConversions.asScalaBuffer(zCols), fileNum,
+        HoodieClusteringConfig.BuildLayoutOptimizationStrategy.ZORDER.toCustomString());
   }
 
   public static Dataset<Row> createZIndexedDataFrameBySample(Dataset<Row> df, String zCols, int fileNum) {
@@ -584,7 +589,7 @@ private static String composeZIndexColName(String col, String statName) {
    * @VisibleForTesting
    */
   @Nonnull
-  static String createIndexMergeSql(
+  public static String createIndexMergeSql(
       @Nonnull String originalIndexTable,
       @Nonnull String newIndexTable,
       @Nonnull List<String> columns

diff --git a/...nt/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java b/...nt/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java
@@ -18,8 +18,6 @@
 
 package org.apache.hudi.table;
 
-import org.apache.avro.Schema;
-import org.apache.hadoop.fs.Path;
 import org.apache.hudi.AvroConversionUtils;
 import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.avro.model.HoodieCleanMetadata;
@@ -49,6 +47,7 @@
 import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.exception.HoodieNotSupportedException;
 import org.apache.hudi.exception.HoodieUpsertException;
+import org.apache.hudi.index.zorder.ZOrderingIndexHelper;
 import org.apache.hudi.io.HoodieCreateHandle;
 import org.apache.hudi.io.HoodieMergeHandle;
 import org.apache.hudi.io.HoodieSortedMergeHandle;
@@ -76,12 +75,15 @@
 import org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor;
 import org.apache.hudi.table.action.rollback.CopyOnWriteRollbackActionExecutor;
 import org.apache.hudi.table.action.savepoint.SavepointActionExecutor;
+
+import org.apache.avro.Schema;
+import org.apache.hadoop.fs.Path;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
-import org.apache.hudi.index.zorder.ZOrderingIndexHelper;
 import org.apache.spark.api.java.JavaRDD;
 
 import javax.annotation.Nonnull;
+
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collections;