diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java index 823c7a37ba37..d20fb0c17534 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java @@ -78,6 +78,9 @@ private InputFormatConfig() { public static final String CATALOG_CONFIG_PREFIX = "iceberg.catalog."; + public static final String SORT_ORDER = "sort.order"; + public static final String SORT_COLUMNS = "sort.columns"; + public enum InMemoryDataModel { HIVE, GENERIC // Default data model is of Iceberg Generics diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/BaseHiveIcebergMetaHook.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/BaseHiveIcebergMetaHook.java index 58fd7a7bd589..021738bb82ae 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/BaseHiveIcebergMetaHook.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/BaseHiveIcebergMetaHook.java @@ -19,6 +19,7 @@ package org.apache.iceberg.mr.hive; +import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import java.util.Arrays; import java.util.Collection; @@ -41,6 +42,8 @@ import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.ql.ddl.misc.sortoder.SortFieldDesc; import org.apache.hadoop.hive.ql.ddl.misc.sortoder.SortFields; +import org.apache.hadoop.hive.ql.ddl.misc.sortoder.ZOrderFieldDesc; +import org.apache.hadoop.hive.ql.ddl.misc.sortoder.ZOrderFields; import org.apache.hadoop.hive.ql.util.NullOrdering; import org.apache.iceberg.BaseMetastoreTableOperations; import org.apache.iceberg.BaseTable; @@ -74,6 +77,8 @@ import org.slf4j.LoggerFactory; import static org.apache.iceberg.RowLevelOperationMode.MERGE_ON_READ; +import static org.apache.iceberg.mr.InputFormatConfig.SORT_COLUMNS; +import static org.apache.iceberg.mr.InputFormatConfig.SORT_ORDER; public class BaseHiveIcebergMetaHook implements HiveMetaHook { private static final Logger LOG = LoggerFactory.getLogger(BaseHiveIcebergMetaHook.class); @@ -217,28 +222,83 @@ private void validateCatalogConfigsDefined() { } } + /** + * Persists the table's write sort order based on the HMS property 'default-sort-order' + * that is populated by the DDL layer. + *

+ * Behaviour:
+ * - If the JSON represents Z-order, we remove DEFAULT_SORT_ORDER,
+ * since the Iceberg spec has no notion of Z-order.
+ * Instead, we persist the Z-order metadata in {@link org.apache.iceberg.mr.InputFormatConfig#SORT_ORDER}
+ * and {@link org.apache.iceberg.mr.InputFormatConfig#SORT_COLUMNS} to be used by the Hive writer.
+ *
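+ * (Illustrative JSON shape, assuming Jackson's default property naming for {@code ZOrderFields}:
+ * {@code {"zorderFields":[{"columnName":"id"},{"columnName":"text"}]}};
+ * {@code isZOrderJSON} probes for the {@code "zorderFields"} key.)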

+ * - Otherwise, the JSON is a list of SortFields; we convert it to Iceberg
+ * SortOrder JSON and keep it in DEFAULT_SORT_ORDER for Iceberg to use.
+ */
private void setSortOrder(org.apache.hadoop.hive.metastore.api.Table hmsTable, Schema schema,
Properties properties) {
- String sortOderJSONString = hmsTable.getParameters().get(TableProperties.DEFAULT_SORT_ORDER);
- SortFields sortFields = null;
- if (!Strings.isNullOrEmpty(sortOderJSONString)) {
- try {
- sortFields = JSON_OBJECT_MAPPER.reader().readValue(sortOderJSONString, SortFields.class);
- } catch (Exception e) {
- LOG.warn("Can not read write order json: {}", sortOderJSONString, e);
- return;
- }
+ String sortOrderJSONString = hmsTable.getParameters().get(TableProperties.DEFAULT_SORT_ORDER);
+ if (Strings.isNullOrEmpty(sortOrderJSONString)) {
+ return;
+ }
+
+ if (isZOrderJSON(sortOrderJSONString)) {
+ properties.remove(TableProperties.DEFAULT_SORT_ORDER);
+ setZOrderSortOrder(sortOrderJSONString, properties);
+ return;
+ }
+
+ try {
+ SortFields sortFields = JSON_OBJECT_MAPPER.reader().readValue(sortOrderJSONString, SortFields.class);
if (sortFields != null && !sortFields.getSortFields().isEmpty()) {
- SortOrder.Builder sortOderBuilder = SortOrder.builderFor(schema);
+ SortOrder.Builder sortOrderBuilder = SortOrder.builderFor(schema);
sortFields.getSortFields().forEach(fieldDesc -> {
NullOrder nullOrder = fieldDesc.getNullOrdering() == NullOrdering.NULLS_FIRST ?
- NullOrder.NULLS_FIRST : NullOrder.NULLS_LAST;
+ NullOrder.NULLS_FIRST : NullOrder.NULLS_LAST;
SortDirection sortDirection = fieldDesc.getDirection() == SortFieldDesc.SortDirection.ASC ?
- SortDirection.ASC : SortDirection.DESC;
- sortOderBuilder.sortBy(fieldDesc.getColumnName(), sortDirection, nullOrder);
+ SortDirection.ASC : SortDirection.DESC;
+ sortOrderBuilder.sortBy(fieldDesc.getColumnName(), sortDirection, nullOrder);
});
- properties.put(TableProperties.DEFAULT_SORT_ORDER, SortOrderParser.toJson(sortOderBuilder.build()));
+ properties.put(TableProperties.DEFAULT_SORT_ORDER, SortOrderParser.toJson(sortOrderBuilder.build()));
}
+ } catch (Exception e) {
+ LOG.warn("Can not read write order json: {}", sortOrderJSONString, e);
+ }
+ }
+
+ /**
+ * Configures the Z-order sort order metadata in the given properties
+ * based on the specified Z-order fields.
+ *
+ * @param jsonString the JSON string representing sort orders
+ * @param properties the Properties object to store sort order metadata
+ */
+ private void setZOrderSortOrder(String jsonString, Properties properties) {
+ try {
+ ZOrderFields zorderFields = JSON_OBJECT_MAPPER.reader().readValue(jsonString, ZOrderFields.class);
+ if (zorderFields != null && !zorderFields.getZOrderFields().isEmpty()) {
+ List<String> columnNames = zorderFields.getZOrderFields().stream()
+ .map(ZOrderFieldDesc::getColumnName)
+ .collect(Collectors.toList());
+
+ LOG.info("Setting Z-order sort order for columns: {}", columnNames);
+
+ properties.put(SORT_ORDER, "ZORDER");
+ properties.put(SORT_COLUMNS, String.join(",", columnNames));
+
+ LOG.info("Z-order sort order configured for Iceberg table with columns: {}", columnNames);
+ }
+ } catch (Exception e) {
+ LOG.warn("Failed to parse Z-order sort order", e);
+ }
+ }
+
+ private boolean isZOrderJSON(String jsonString) {
+ try {
+ JsonNode node = JSON_OBJECT_MAPPER.readTree(jsonString);
+ return node.has("zorderFields");
+ } catch (Exception e) {
+ return false;
+ }
+ }
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
index 6dc92ff411e7..15132bb17e7f 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
@@ -82,6 +82,7 @@
import org.apache.hadoop.hive.ql.ddl.table.create.like.CreateTableLikeDesc;
import org.apache.hadoop.hive.ql.ddl.table.misc.properties.AlterTableSetPropertiesDesc;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.io.IOConstants;
@@ -119,6 +120,7 @@
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.ql.session.SessionStateUtil;
import org.apache.hadoop.hive.ql.stats.Partish;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.util.NullOrdering;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.Deserializer;
@@ -184,6 +186,7 @@
import org.apache.iceberg.mr.InputFormatConfig;
import org.apache.iceberg.mr.hive.actions.HiveIcebergDeleteOrphanFiles;
import org.apache.iceberg.mr.hive.plan.IcebergBucketFunction;
+import org.apache.iceberg.mr.hive.udf.GenericUDFIcebergZorder;
import org.apache.iceberg.puffin.Blob;
import org.apache.iceberg.puffin.BlobMetadata;
import org.apache.iceberg.puffin.Puffin;
@@ -218,6 +221,8 @@
import static org.apache.iceberg.SnapshotSummary.TOTAL_FILE_SIZE_PROP;
import static org.apache.iceberg.SnapshotSummary.TOTAL_POS_DELETES_PROP;
import static org.apache.iceberg.SnapshotSummary.TOTAL_RECORDS_PROP;
+import static org.apache.iceberg.mr.InputFormatConfig.SORT_COLUMNS;
+import static org.apache.iceberg.mr.InputFormatConfig.SORT_ORDER;
public class HiveIcebergStorageHandler extends DefaultStorageHandler implements HiveStoragePredicateHandler {
private static final Logger LOG = LoggerFactory.getLogger(HiveIcebergStorageHandler.class);
@@ -929,9 +934,64 @@
public DynamicPartitionCtx createDPContext(
addCustomSortExpr(table, hmsTable, writeOperation, dpCtx, getSortTransformSpec(table));
}
+ // Even if table has no explicit sort
order, honor z-order if configured
+ Map<String, String> props = table.properties();
+ if ("ZORDER".equalsIgnoreCase(props.getOrDefault(SORT_ORDER, ""))) {
+ createZOrderCustomSort(props, dpCtx, table, hmsTable, writeOperation);
+ }
+
return dpCtx;
}
+ /**
+ * Adds a custom sort expression to the DynamicPartitionCtx that performs local Z-ordering on write.
+ *
+ * Behavior:
+ * - Reads Z-order properties from 'sort.order' and 'sort.columns' (comma-separated).
+ * - Resolves the referenced columns to their positions in the physical row (taking into account
+ * the ACID virtual columns offset for overwrite/update operations).
+ * - Configures a single ASC sort key with NULLS FIRST and injects a custom key expression for
+ * Z-order.
+ */
+ private void createZOrderCustomSort(Map<String, String> props, DynamicPartitionCtx dpCtx, Table table,
+ org.apache.hadoop.hive.ql.metadata.Table hmsTable, Operation writeOperation) {
+ String colsProp = props.get(SORT_COLUMNS);
+ if (StringUtils.isNotBlank(colsProp)) {
+ List<String> zCols = Arrays.stream(colsProp.split(",")).map(String::trim)
+ .filter(s -> !s.isEmpty()).collect(Collectors.toList());
+
+ Map<String, Integer> fieldOrderMap = Maps.newHashMap();
+ List<Types.NestedField> fields = table.schema().columns();
+ for (int i = 0; i < fields.size(); ++i) {
+ fieldOrderMap.put(fields.get(i).name(), i);
+ }
+ int offset = (shouldOverwrite(hmsTable, writeOperation) ?
+ ACID_VIRTUAL_COLS_AS_FIELD_SCHEMA : acidSelectColumns(hmsTable, writeOperation)).size();
+
+ List<Integer> zIndices = zCols.stream().map(col -> {
+ Integer base = fieldOrderMap.get(col);
+ Preconditions.checkArgument(base != null, "Z-order column not found in schema: %s", col);
+ return base + offset;
+ }).collect(Collectors.toList());
+
+ dpCtx.setCustomSortOrder(Lists.newArrayList(Collections.singletonList(1)));
+ dpCtx.setCustomSortNullOrder(Lists.newArrayList(Collections.singletonList(NullOrdering.NULLS_FIRST.getCode())));
+
+ dpCtx.addCustomSortExpressions(Collections.singletonList(allCols -> {
+ List<ExprNodeDesc> args = Lists.newArrayListWithExpectedSize(zIndices.size());
+ for (Integer idx : zIndices) {
+ args.add(allCols.get(idx));
+ }
+ try {
+ GenericUDF udf = new GenericUDFIcebergZorder();
+ return ExprNodeGenericFuncDesc.newInstance(udf, "iceberg_zorder", args);
+ } catch (UDFArgumentException e) {
+ throw new RuntimeException(e);
+ }
+ }));
+ }
+ }
+
private void addCustomSortExpr(Table table, org.apache.hadoop.hive.ql.metadata.Table hmsTable,
Operation writeOperation, DynamicPartitionCtx dpCtx, List<TransformSpec> transformSpecs) {
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/udf/GenericUDFIcebergZorder.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/udf/GenericUDFIcebergZorder.java
new file mode 100644
index 000000000000..92fb2b7c8b26
--- /dev/null
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/udf/GenericUDFIcebergZorder.java
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.mr.hive.udf; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.BytesWritable; +import org.apache.iceberg.util.ZOrderByteUtils; + +/** + * Hive UDF to compute the Z-order value of given input columns using Iceberg's ZOrderByteUtils. + * Supports various primitive types and converts inputs into interleaved binary representation. + */ +@Description(name = "iceberg_zorder", + value = "_FUNC_(value) - " + + "Returns the z-value calculated by Iceberg ZOrderByteUtils class") +public class GenericUDFIcebergZorder extends GenericUDF { + private PrimitiveObjectInspector[] argOIs; + // For variable-length types (e.g., strings), how many bytes contribute to z-order + private final int varLengthContribution = 8; + private transient ByteBuffer[] reUseBuffer; + private static final int MAX_OUTPUT_SIZE = Integer.MAX_VALUE; + // Zero-filled byte array for representing NULL values + private static final byte[] NULL_ORDERED_BYTES = new byte[ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE]; + + /** + * Initializes the UDF, validating argument types are primitives and preparing buffers. + */ + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + if (arguments.length < 2) { + throw new UDFArgumentException("iceberg_zorder requires at least 2 arguments"); + } + argOIs = new PrimitiveObjectInspector[arguments.length]; + reUseBuffer = new ByteBuffer[arguments.length]; + for (int i = 0; i < arguments.length; i++) { + if (!(arguments[i] instanceof PrimitiveObjectInspector poi)) { + throw new UDFArgumentTypeException(i, "Only primitive types supported for z-order"); + } + argOIs[i] = poi; + } + return PrimitiveObjectInspectorFactory.writableBinaryObjectInspector; + } + + /** + * Evaluates the UDF by converting input values to ordered bytes, interleaving them, + * and returning the resulting Z-order binary value. 
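+ * <p>
+ * (Illustrative, based on ZOrderByteUtils semantics: for two INT columns each value
+ * becomes an 8-byte ordered key, so interleaveBits receives two 8-byte arrays and
+ * emits a 16-byte z-value whose bit i is drawn from input (i % 2).)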
+ */ + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + byte[][] inputs = new byte[arguments.length][]; + int totalLength = 0; + + for (int i = 0; i < arguments.length; i++) { + byte[] orderedBytes = convertToOrderedBytes(arguments[i].get(), argOIs[i], i); + inputs[i] = orderedBytes; + totalLength += orderedBytes.length; + } + + int outputLength = Math.min(totalLength, MAX_OUTPUT_SIZE); + ByteBuffer buffer = ByteBuffer.allocate(outputLength); + + byte[] interleaved = ZOrderByteUtils.interleaveBits(inputs, outputLength, buffer); + return new BytesWritable(interleaved); + } + + @Override + public String getDisplayString(String[] children) { + return "iceberg_zorder(" + String.join(", ", children) + ")"; + } + + /** + * Converts a single input value to its ordered byte representation based on type. + * @return fixed-length byte arrays to be used in interleaving. + */ + private byte[] convertToOrderedBytes(Object value, PrimitiveObjectInspector oi, + int position) throws HiveException { + if (value == null) { + return NULL_ORDERED_BYTES; + } + + if (reUseBuffer[position] == null) { + reUseBuffer[position] = ByteBuffer.allocate(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); + } + switch (oi.getPrimitiveCategory()) { + case BOOLEAN: + boolean boolValue = (Boolean) oi.getPrimitiveJavaObject(value); + return ZOrderByteUtils.intToOrderedBytes(boolValue ? 1 : 0, reUseBuffer[position]).array(); + + case BYTE: + byte byteValue = (Byte) oi.getPrimitiveJavaObject(value); + return ZOrderByteUtils.tinyintToOrderedBytes(byteValue, reUseBuffer[position]).array(); + + case SHORT: + short shortValue = (Short) oi.getPrimitiveJavaObject(value); + return ZOrderByteUtils.shortToOrderedBytes(shortValue, reUseBuffer[position]).array(); + + case INT: + int intValue = (Integer) oi.getPrimitiveJavaObject(value); + return ZOrderByteUtils.intToOrderedBytes(intValue, reUseBuffer[position]).array(); + + case LONG: + long longValue = (Long) oi.getPrimitiveJavaObject(value); + return ZOrderByteUtils.longToOrderedBytes(longValue, reUseBuffer[position]).array(); + + case FLOAT: + float floatValue = (Float) oi.getPrimitiveJavaObject(value); + return ZOrderByteUtils.floatToOrderedBytes(floatValue, reUseBuffer[position]).array(); + + case DOUBLE: + double doubleValue = (Double) oi.getPrimitiveJavaObject(value); + return ZOrderByteUtils.doubleToOrderedBytes(doubleValue, reUseBuffer[position]).array(); + + case DATE: + // Convert DATE to epoch days (days since 1970-01-01 UTC) + Object dateValue = oi.getPrimitiveJavaObject(value); + long epochDays; + if (dateValue instanceof java.sql.Date dd) { + epochDays = dd.toLocalDate().toEpochDay(); + } else if (dateValue instanceof org.apache.hadoop.hive.common.type.Date dd) { + epochDays = dd.toEpochDay(); + } else { + throw new HiveException("Unsupported DATE backing type: " + dateValue.getClass()); + } + return ZOrderByteUtils.longToOrderedBytes(epochDays, reUseBuffer[position]).array(); + + case TIMESTAMP: + Object tsValue = oi.getPrimitiveJavaObject(value); + long tsInMillis; + if (tsValue instanceof org.apache.hadoop.hive.common.type.Timestamp ts) { + tsInMillis = ts.toEpochMilli(); + } else if (tsValue instanceof java.sql.Timestamp ts) { + tsInMillis = ts.getTime(); + } else { + throw new HiveException("Unsupported TIMESTAMP backing type: " + tsValue.getClass()); + } + return ZOrderByteUtils.longToOrderedBytes(tsInMillis, reUseBuffer[position]).array(); + + case CHAR: + case VARCHAR: + case STRING: + String strVal = 
String.valueOf(oi.getPrimitiveJavaObject(value)); + return ZOrderByteUtils.stringToOrderedBytes(strVal, varLengthContribution, + reUseBuffer[position], StandardCharsets.UTF_8.newEncoder()).array(); + + default: + throw new HiveException("Unsupported type in z-order: " + oi.getPrimitiveCategory()); + } + } +} diff --git a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_create_locally_zordered_table.q b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_create_locally_zordered_table.q new file mode 100644 index 000000000000..88c98abeabbb --- /dev/null +++ b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_create_locally_zordered_table.q @@ -0,0 +1,110 @@ +-- Mask neededVirtualColumns due to non-strict order +--! qt:replace:/(\s+neededVirtualColumns:\s)(.*)/$1#Masked#/ +-- Mask the totalSize value as it can have slight variability, causing test flakiness +--! qt:replace:/(\s+totalSize\s+)\S+(\s+)/$1#Masked#$2/ +-- Mask random uuid +--! qt:replace:/(\s+uuid\s+)\S+(\s*)/$1#Masked#$2/ +-- Mask a random snapshot id +--! qt:replace:/(\s+current-snapshot-id\s+)\S+(\s*)/$1#Masked#/ +-- Mask added file size +--! qt:replace:/(\S\"added-files-size\\\":\\\")(\d+)(\\\")/$1#Masked#$3/ +-- Mask total file size +--! qt:replace:/(\S\"total-files-size\\\":\\\")(\d+)(\\\")/$1#Masked#$3/ +-- Mask removed file size +--! qt:replace:/(\S\"removed-files-size\\\":\\\")(\d+)(\\\")/$1#Masked#$3/ +-- Mask current-snapshot-timestamp-ms +--! qt:replace:/(\s+current-snapshot-timestamp-ms\s+)\S+(\s*)/$1#Masked#$2/ +--! qt:replace:/(MAJOR\s+succeeded\s+)[a-zA-Z0-9\-\.\s+]+(\s+manual)/$1#Masked#$2/ +-- Mask iceberg version +--! qt:replace:/(\S\"iceberg-version\\\":\\\")(\w+\s\w+\s\d+\.\d+\.\d+\s\(\w+\s\w+\))(\\\")/$1#Masked#$3/ +set hive.llap.io.enabled=true; +set hive.vectorized.execution.enabled=true; +set hive.optimize.shared.work.merge.ts.schema=true; + +-- Validates z-order on CREATE via clause. 
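+-- (The WRITE LOCALLY ZORDER clause below is parsed into TOK_WRITE_LOCALLY_ZORDER and
+-- surfaces as the 'sort.order'/'sort.columns' table properties in the DESCRIBE output.)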
+CREATE TABLE default.zorder_it_nulls (
+ id int,
+ text string)
+WRITE LOCALLY ZORDER by (id, text)
+STORED BY iceberg
+STORED As orc;
+
+DESCRIBE FORMATTED default.zorder_it_nulls;
+EXPLAIN INSERT INTO default.zorder_it_nulls VALUES (3, "3"),(2, "2"),(4, "4"),(5, "5"),(1, "1"),(2, "3"),(3,null),(2,null),(null,"a");
+INSERT INTO default.zorder_it_nulls VALUES
+ (3, "3"),
+ (2, "2"),
+ (4, "4"),
+ (5, "5"),
+ (1, "1"),
+ (2, "3"),
+ (3,null),
+ (2,null),
+ (null,"a");
+
+SELECT * FROM default.zorder_it_nulls;
+DROP TABLE default.zorder_it_nulls;
+
+
+CREATE TABLE default.zorder_dit (
+ id int,
+ text string,
+ bool_val boolean,
+ date_val date)
+WRITE LOCALLY ZORDER by (date_val, id, text)
+STORED BY iceberg
+STORED As orc;
+
+DESCRIBE FORMATTED default.zorder_dit;
+INSERT INTO default.zorder_dit VALUES
+ (0, 'a', false, DATE '2023-01-01'),
+ (255, 'z', true, DATE '2025-12-31'),
+ (0, 'z', false, DATE '2025-12-31'),
+ (255, 'a', true, DATE '2023-01-01'),
+ (128, 'm', true, DATE '2024-06-01'),
+ (64, 'c', false, DATE '2023-06-01'),
+ (192, 'x', true, DATE '2025-01-01'),
+ (32, 'b', true, DATE '2023-03-01'),
+ (96, 'd', false, DATE '2023-09-01'),
+ (160, 'v', true, DATE '2024-09-01');
+
+SELECT * FROM default.zorder_dit;
+DROP TABLE default.zorder_dit;
+
+CREATE TABLE default.zorder_tsdl (
+ ts timestamp,
+ dd double,
+ ll bigint)
+WRITE LOCALLY ZORDER by (ts, dd, ll)
+STORED BY iceberg
+STORED As orc;
+
+DESCRIBE FORMATTED default.zorder_tsdl;
+INSERT INTO default.zorder_tsdl VALUES
+ (TIMESTAMP '2022-01-01 00:00:00', 0.0, 0),
+ (TIMESTAMP '2030-12-31 23:59:59', 9999.99, 9999999),
+ (TIMESTAMP '2022-01-01 00:00:00', 9999.99, 9999999),
+ (TIMESTAMP '2030-12-31 23:59:59', 0.0, 0),
+ (TIMESTAMP '2026-06-15 12:00:00', 5000.5, 5000000),
+ (TIMESTAMP '2023-03-03 03:03:03', 9999.99, 0),
+ (TIMESTAMP '2023-03-03 03:03:03', 0.0, 9999999),
+ (TIMESTAMP '2025-05-05 05:05:05', 250.25, 54321),
+ (TIMESTAMP '2027-07-07 07:07:07', 8000.8, 8888888),
+ (TIMESTAMP '2024-04-04 04:04:04', 1000.1, 123456);
+
+SELECT * FROM default.zorder_tsdl;
+DROP TABLE default.zorder_tsdl;
+
+-- Validates z-order on CREATE via TBLPROPERTIES.
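+-- (Here 'sort.order'/'sort.columns' are supplied directly; the value is matched
+-- case-insensitively, so "zorder" is honored the same way as "ZORDER".)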
+CREATE TABLE default.zorder_props(id int, text string) +STORED BY iceberg +STORED As orc +TBLPROPERTIES ("sort.order" = "zorder", "sort.columns" = "id,text"); + +INSERT INTO default.zorder_props VALUES (3, 'B'),(1, 'A'),(7, 'C'),(2, 'A'),(9, 'B'),(6, 'C'),(4, 'A'), + (10, 'C'),(5, NULL),(8, 'B'),(NULL, 'A'),(12, 'C'),(11, 'A'),(13, NULL), + (14, 'B'),(15, 'C'),(16, 'A'),(19, 'B'),(17, 'C'),(18, 'A'); + +DESCRIBE FORMATTED default.zorder_props; +SELECT * FROM default.zorder_props; +DROP TABLE default.zorder_props; diff --git a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_create_locally_zordered_table.q.out b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_create_locally_zordered_table.q.out new file mode 100644 index 000000000000..d54bf0daef87 --- /dev/null +++ b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_create_locally_zordered_table.q.out @@ -0,0 +1,608 @@ +PREHOOK: query: CREATE TABLE default.zorder_it_nulls ( + id int, + text string) +WRITE LOCALLY ZORDER by (id, text) +STORED BY iceberg +STORED As orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@zorder_it_nulls +POSTHOOK: query: CREATE TABLE default.zorder_it_nulls ( + id int, + text string) +WRITE LOCALLY ZORDER by (id, text) +STORED BY iceberg +STORED As orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@zorder_it_nulls +PREHOOK: query: DESCRIBE FORMATTED default.zorder_it_nulls +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@zorder_it_nulls +POSTHOOK: query: DESCRIBE FORMATTED default.zorder_it_nulls +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@zorder_it_nulls +# col_name data_type comment +id int +text string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: EXTERNAL_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"id\":\"true\",\"text\":\"true\"}} + EXTERNAL TRUE + bucketing_version 2 + current-schema {\"type\":\"struct\",\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"id\",\"required\":false,\"type\":\"int\"},{\"id\":2,\"name\":\"text\",\"required\":false,\"type\":\"string\"}]} + format-version 2 + iceberg.orc.files.only true +#### A masked pattern was here #### + numFiles 0 + numRows 0 + parquet.compression zstd + rawDataSize 0 + serialization.format 1 + snapshot-count 0 + sort.columns id,text + sort.order ZORDER + storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler + table_type ICEBERG + totalSize #Masked# +#### A masked pattern was here #### + uuid #Masked# + write.delete.mode merge-on-read + write.format.default orc + write.merge.mode merge-on-read + write.update.mode merge-on-read + +# Storage Information +SerDe Library: org.apache.iceberg.mr.hive.HiveIcebergSerDe +InputFormat: org.apache.iceberg.mr.hive.HiveIcebergInputFormat +OutputFormat: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat +Compressed: No +Sort Columns: [] +PREHOOK: query: EXPLAIN INSERT INTO default.zorder_it_nulls VALUES (3, "3"),(2, "2"),(4, "4"),(5, "5"),(1, "1"),(2, "3"),(3,null),(2,null),(null,"a") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@zorder_it_nulls +POSTHOOK: query: EXPLAIN INSERT INTO default.zorder_it_nulls VALUES (3, "3"),(2, "2"),(4, "4"),(5, "5"),(1, "1"),(2, "3"),(3,null),(2,null),(null,"a") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: 
Output: default@zorder_it_nulls
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 depends on stages: Stage-2
+ Stage-3 depends on stages: Stage-0
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+ Reducer 3 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: _dummy_table
+ Row Limit Per Split: 1
+ Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: array(const struct(3,'3'),const struct(2,'2'),const struct(4,'4'),const struct(5,'5'),const struct(1,'1'),const struct(2,'3'),const struct(3,null),const struct(2,null),const struct(null,'a')) (type: array<struct<col1:int,col2:string>>)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+ UDTF Operator
+ Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+ function name: inline
+ Select Operator
+ expressions: col1 (type: int), col2 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: iceberg_zorder(_col0, _col1) (type: binary)
+ null sort order: a
+ sort order: +
+ Map-reduce partition columns: iceberg_zorder(_col0, _col1) (type: binary)
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: int), _col1 (type: string)
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string)
+ outputColumnNames: id, text
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: min(id), max(id), count(1), count(id), compute_bit_vector_hll(id), max(length(text)), avg(COALESCE(length(text),0)), count(text), compute_bit_vector_hll(text)
+ minReductionHashAggr: 0.4
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
+ Statistics: Num rows: 1 Data size: 400 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ null sort order:
+ sort order:
+ Statistics: Num rows: 1 Data size: 400 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: binary), _col5 (type: int), _col6 (type: struct<count:bigint,sum:double,input:int>), _col7 (type: bigint), _col8 (type: binary)
+ Execution mode: llap
+ LLAP IO: no inputs
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Select Operator
+ expressions: VALUE._col0 (type: int), VALUE._col1 (type: string), KEY.iceberg_zorder(_col0, _col1) (type: binary)
+ outputColumnNames: _col0, _col1, iceberg_zorder(_col0, _col1)
+ File Output Operator
+ compressed: false
+ Dp Sort State: PARTITION_SORTED
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
+ output format: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
+ serde: org.apache.iceberg.mr.hive.HiveIcebergSerDe
+ name: default.zorder_it_nulls
+ Reducer 3
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: min(VALUE._col0), max(VALUE._col1), count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4), max(VALUE._col5), avg(VALUE._col6), count(VALUE._col7), compute_bit_vector_hll(VALUE._col8)
+ mode:
mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 1 Data size: 332 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: 'LONG' (type: string), UDFToLong(_col0) (type: bigint), UDFToLong(_col1) (type: bigint), (_col2 - _col3) (type: bigint), COALESCE(ndv_compute_bit_vector(_col4),0) (type: bigint), _col4 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col5,0)) (type: bigint), COALESCE(_col6,0) (type: double), (_col2 - _col7) (type: bigint), COALESCE(ndv_compute_bit_vector(_col8),0) (type: bigint), _col8 (type: binary) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 + Statistics: Num rows: 1 Data size: 530 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 530 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: false + table: + input format: org.apache.iceberg.mr.hive.HiveIcebergInputFormat + output format: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat + serde: org.apache.iceberg.mr.hive.HiveIcebergSerDe + name: default.zorder_it_nulls + + Stage: Stage-3 + Stats Work + Basic Stats Work: + Column Stats Desc: + Columns: id, text + Column Types: int, string + Table: default.zorder_it_nulls + +PREHOOK: query: INSERT INTO default.zorder_it_nulls VALUES + (3, "3"), + (2, "2"), + (4, "4"), + (5, "5"), + (1, "1"), + (2, "3"), + (3,null), + (2,null), + (null,"a") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@zorder_it_nulls +POSTHOOK: query: INSERT INTO default.zorder_it_nulls VALUES + (3, "3"), + (2, "2"), + (4, "4"), + (5, "5"), + (1, "1"), + (2, "3"), + (3,null), + (2,null), + (null,"a") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@zorder_it_nulls +PREHOOK: query: SELECT * FROM default.zorder_it_nulls +PREHOOK: type: QUERY +PREHOOK: Input: default@zorder_it_nulls +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM default.zorder_it_nulls +POSTHOOK: type: QUERY +POSTHOOK: Input: default@zorder_it_nulls +#### A masked pattern was here #### +NULL a +2 NULL +3 NULL +1 1 +2 2 +2 3 +3 3 +4 4 +5 5 +PREHOOK: query: DROP TABLE default.zorder_it_nulls +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@zorder_it_nulls +PREHOOK: Output: database:default +PREHOOK: Output: default@zorder_it_nulls +POSTHOOK: query: DROP TABLE default.zorder_it_nulls +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@zorder_it_nulls +POSTHOOK: Output: database:default +POSTHOOK: Output: default@zorder_it_nulls +PREHOOK: query: CREATE TABLE default.zorder_dit ( + id int, + text string, + bool_val boolean, + date_val date) +WRITE LOCALLY ZORDER by (date_val, id, text) +STORED BY iceberg +STORED As orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@zorder_dit +POSTHOOK: query: CREATE TABLE default.zorder_dit ( + id int, + text string, + bool_val boolean, + date_val date) +WRITE LOCALLY ZORDER by (date_val, id, text) +STORED BY iceberg +STORED As orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: 
default@zorder_dit +PREHOOK: query: DESCRIBE FORMATTED default.zorder_dit +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@zorder_dit +POSTHOOK: query: DESCRIBE FORMATTED default.zorder_dit +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@zorder_dit +# col_name data_type comment +id int +text string +bool_val boolean +date_val date + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: EXTERNAL_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"bool_val\":\"true\",\"date_val\":\"true\",\"id\":\"true\",\"text\":\"true\"}} + EXTERNAL TRUE + bucketing_version 2 + current-schema {\"type\":\"struct\",\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"id\",\"required\":false,\"type\":\"int\"},{\"id\":2,\"name\":\"text\",\"required\":false,\"type\":\"string\"},{\"id\":3,\"name\":\"bool_val\",\"required\":false,\"type\":\"boolean\"},{\"id\":4,\"name\":\"date_val\",\"required\":false,\"type\":\"date\"}]} + format-version 2 + iceberg.orc.files.only true +#### A masked pattern was here #### + numFiles 0 + numRows 0 + parquet.compression zstd + rawDataSize 0 + serialization.format 1 + snapshot-count 0 + sort.columns date_val,id,text + sort.order ZORDER + storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler + table_type ICEBERG + totalSize #Masked# +#### A masked pattern was here #### + uuid #Masked# + write.delete.mode merge-on-read + write.format.default orc + write.merge.mode merge-on-read + write.update.mode merge-on-read + +# Storage Information +SerDe Library: org.apache.iceberg.mr.hive.HiveIcebergSerDe +InputFormat: org.apache.iceberg.mr.hive.HiveIcebergInputFormat +OutputFormat: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat +Compressed: No +Sort Columns: [] +PREHOOK: query: INSERT INTO default.zorder_dit VALUES + (0, 'a', false, DATE '2023-01-01'), + (255, 'z', true, DATE '2025-12-31'), + (0, 'z', false, DATE '2025-12-31'), + (255, 'a', true, DATE '2023-01-01'), + (128, 'm', true, DATE '2024-06-01'), + (64, 'c', false, DATE '2023-06-01'), + (192, 'x', true, DATE '2025-01-01'), + (32, 'b', true, DATE '2023-03-01'), + (96, 'd', false, DATE '2023-09-01'), + (160, 'v', true, DATE '2024-09-01') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@zorder_dit +POSTHOOK: query: INSERT INTO default.zorder_dit VALUES + (0, 'a', false, DATE '2023-01-01'), + (255, 'z', true, DATE '2025-12-31'), + (0, 'z', false, DATE '2025-12-31'), + (255, 'a', true, DATE '2023-01-01'), + (128, 'm', true, DATE '2024-06-01'), + (64, 'c', false, DATE '2023-06-01'), + (192, 'x', true, DATE '2025-01-01'), + (32, 'b', true, DATE '2023-03-01'), + (96, 'd', false, DATE '2023-09-01'), + (160, 'v', true, DATE '2024-09-01') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@zorder_dit +PREHOOK: query: SELECT * FROM default.zorder_dit +PREHOOK: type: QUERY +PREHOOK: Input: default@zorder_dit +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM default.zorder_dit +POSTHOOK: type: QUERY +POSTHOOK: Input: default@zorder_dit +#### A masked pattern was here #### +0 a false 2023-01-01 +255 a true 2023-01-01 +32 b true 2023-03-01 +64 c false 2023-06-01 +96 d false 2023-09-01 +128 m true 2024-06-01 +160 v true 2024-09-01 +192 x true 2025-01-01 +0 z false 2025-12-31 +255 z true 2025-12-31 +PREHOOK: query: DROP TABLE default.zorder_dit +PREHOOK: type: DROPTABLE +PREHOOK: Input: 
default@zorder_dit +PREHOOK: Output: database:default +PREHOOK: Output: default@zorder_dit +POSTHOOK: query: DROP TABLE default.zorder_dit +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@zorder_dit +POSTHOOK: Output: database:default +POSTHOOK: Output: default@zorder_dit +PREHOOK: query: CREATE TABLE default.zorder_tsdl ( + ts timestamp, + dd double, + ll bigint) +WRITE LOCALLY ZORDER by (ts, dd, ll) +STORED BY iceberg +STORED As orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@zorder_tsdl +POSTHOOK: query: CREATE TABLE default.zorder_tsdl ( + ts timestamp, + dd double, + ll bigint) +WRITE LOCALLY ZORDER by (ts, dd, ll) +STORED BY iceberg +STORED As orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@zorder_tsdl +PREHOOK: query: DESCRIBE FORMATTED default.zorder_tsdl +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@zorder_tsdl +POSTHOOK: query: DESCRIBE FORMATTED default.zorder_tsdl +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@zorder_tsdl +# col_name data_type comment +ts timestamp +dd double +ll bigint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: EXTERNAL_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"dd\":\"true\",\"ll\":\"true\",\"ts\":\"true\"}} + EXTERNAL TRUE + bucketing_version 2 + current-schema {\"type\":\"struct\",\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"ts\",\"required\":false,\"type\":\"timestamp\"},{\"id\":2,\"name\":\"dd\",\"required\":false,\"type\":\"double\"},{\"id\":3,\"name\":\"ll\",\"required\":false,\"type\":\"long\"}]} + format-version 2 + iceberg.orc.files.only true +#### A masked pattern was here #### + numFiles 0 + numRows 0 + parquet.compression zstd + rawDataSize 0 + serialization.format 1 + snapshot-count 0 + sort.columns ts,dd,ll + sort.order ZORDER + storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler + table_type ICEBERG + totalSize #Masked# +#### A masked pattern was here #### + uuid #Masked# + write.delete.mode merge-on-read + write.format.default orc + write.merge.mode merge-on-read + write.update.mode merge-on-read + +# Storage Information +SerDe Library: org.apache.iceberg.mr.hive.HiveIcebergSerDe +InputFormat: org.apache.iceberg.mr.hive.HiveIcebergInputFormat +OutputFormat: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat +Compressed: No +Sort Columns: [] +PREHOOK: query: INSERT INTO default.zorder_tsdl VALUES + (TIMESTAMP '2022-01-01 00:00:00', 0.0, 0), + (TIMESTAMP '2030-12-31 23:59:59', 9999.99, 9999999), + (TIMESTAMP '2022-01-01 00:00:00', 9999.99, 9999999), + (TIMESTAMP '2030-12-31 23:59:59', 0.0, 0), + (TIMESTAMP '2026-06-15 12:00:00', 5000.5, 5000000), + (TIMESTAMP '2023-03-03 03:03:03', 9999.99, 0), + (TIMESTAMP '2023-03-03 03:03:03', 0.0, 9999999), + (TIMESTAMP '2025-05-05 05:05:05', 250.25, 54321), + (TIMESTAMP '2027-07-07 07:07:07', 8000.8, 8888888), + (TIMESTAMP '2024-04-04 04:04:04', 1000.1, 123456) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@zorder_tsdl +POSTHOOK: query: INSERT INTO default.zorder_tsdl VALUES + (TIMESTAMP '2022-01-01 00:00:00', 0.0, 0), + (TIMESTAMP '2030-12-31 23:59:59', 9999.99, 9999999), + (TIMESTAMP '2022-01-01 00:00:00', 9999.99, 9999999), + (TIMESTAMP '2030-12-31 23:59:59', 0.0, 0), + (TIMESTAMP '2026-06-15 12:00:00', 5000.5, 5000000), + (TIMESTAMP '2023-03-03 03:03:03', 9999.99, 0), + 
(TIMESTAMP '2023-03-03 03:03:03', 0.0, 9999999), + (TIMESTAMP '2025-05-05 05:05:05', 250.25, 54321), + (TIMESTAMP '2027-07-07 07:07:07', 8000.8, 8888888), + (TIMESTAMP '2024-04-04 04:04:04', 1000.1, 123456) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@zorder_tsdl +PREHOOK: query: SELECT * FROM default.zorder_tsdl +PREHOOK: type: QUERY +PREHOOK: Input: default@zorder_tsdl +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM default.zorder_tsdl +POSTHOOK: type: QUERY +POSTHOOK: Input: default@zorder_tsdl +#### A masked pattern was here #### +2022-01-01 00:00:00 0.0 0 +2023-03-03 03:03:03 0.0 9999999 +2030-12-31 23:59:59 0.0 0 +2025-05-05 05:05:05 250.25 54321 +2024-04-04 04:04:04 1000.1 123456 +2026-06-15 12:00:00 5000.5 5000000 +2027-07-07 07:07:07 8000.8 8888888 +2022-01-01 00:00:00 9999.99 9999999 +2023-03-03 03:03:03 9999.99 0 +2030-12-31 23:59:59 9999.99 9999999 +PREHOOK: query: DROP TABLE default.zorder_tsdl +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@zorder_tsdl +PREHOOK: Output: database:default +PREHOOK: Output: default@zorder_tsdl +POSTHOOK: query: DROP TABLE default.zorder_tsdl +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@zorder_tsdl +POSTHOOK: Output: database:default +POSTHOOK: Output: default@zorder_tsdl +PREHOOK: query: CREATE TABLE default.zorder_props(id int, text string) +STORED BY iceberg +STORED As orc +TBLPROPERTIES ("sort.order" = "zorder", "sort.columns" = "id,text") +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@zorder_props +POSTHOOK: query: CREATE TABLE default.zorder_props(id int, text string) +STORED BY iceberg +STORED As orc +TBLPROPERTIES ("sort.order" = "zorder", "sort.columns" = "id,text") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@zorder_props +PREHOOK: query: INSERT INTO default.zorder_props VALUES (3, 'B'),(1, 'A'),(7, 'C'),(2, 'A'),(9, 'B'),(6, 'C'),(4, 'A'), + (10, 'C'),(5, NULL),(8, 'B'),(NULL, 'A'),(12, 'C'),(11, 'A'),(13, NULL), + (14, 'B'),(15, 'C'),(16, 'A'),(19, 'B'),(17, 'C'),(18, 'A') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@zorder_props +POSTHOOK: query: INSERT INTO default.zorder_props VALUES (3, 'B'),(1, 'A'),(7, 'C'),(2, 'A'),(9, 'B'),(6, 'C'),(4, 'A'), + (10, 'C'),(5, NULL),(8, 'B'),(NULL, 'A'),(12, 'C'),(11, 'A'),(13, NULL), + (14, 'B'),(15, 'C'),(16, 'A'),(19, 'B'),(17, 'C'),(18, 'A') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@zorder_props +PREHOOK: query: DESCRIBE FORMATTED default.zorder_props +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@zorder_props +POSTHOOK: query: DESCRIBE FORMATTED default.zorder_props +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@zorder_props +# col_name data_type comment +id int +text string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: EXTERNAL_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"id\":\"true\",\"text\":\"true\"}} + EXTERNAL TRUE + bucketing_version 2 + current-schema {\"type\":\"struct\",\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"id\",\"required\":false,\"type\":\"int\"},{\"id\":2,\"name\":\"text\",\"required\":false,\"type\":\"string\"}]} + current-snapshot-id #Masked# + current-snapshot-summary 
{\"added-data-files\":\"1\",\"added-records\":\"20\",\"added-files-size\":\"#Masked#\",\"changed-partition-count\":\"1\",\"total-records\":\"20\",\"total-files-size\":\"#Masked#\",\"total-data-files\":\"1\",\"total-delete-files\":\"0\",\"total-position-deletes\":\"0\",\"total-equality-deletes\":\"0\",\"iceberg-version\":\"#Masked#\"} + current-snapshot-timestamp-ms #Masked# + format-version 2 + iceberg.orc.files.only true +#### A masked pattern was here #### + numFiles 1 + numRows 20 + parquet.compression zstd +#### A masked pattern was here #### + rawDataSize 0 + serialization.format 1 + snapshot-count 1 + sort.columns id,text + sort.order zorder + storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler + table_type ICEBERG + totalSize #Masked# +#### A masked pattern was here #### + uuid #Masked# + write.delete.mode merge-on-read + write.format.default orc + write.merge.mode merge-on-read + write.update.mode merge-on-read + +# Storage Information +SerDe Library: org.apache.iceberg.mr.hive.HiveIcebergSerDe +InputFormat: org.apache.iceberg.mr.hive.HiveIcebergInputFormat +OutputFormat: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat +Compressed: No +Sort Columns: [] +PREHOOK: query: SELECT * FROM default.zorder_props +PREHOOK: type: QUERY +PREHOOK: Input: default@zorder_props +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM default.zorder_props +POSTHOOK: type: QUERY +POSTHOOK: Input: default@zorder_props +#### A masked pattern was here #### +NULL A +5 NULL +13 NULL +1 A +2 A +4 A +11 A +16 A +18 A +3 B +8 B +9 B +14 B +19 B +6 C +7 C +10 C +12 C +15 C +17 C +PREHOOK: query: DROP TABLE default.zorder_props +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@zorder_props +PREHOOK: Output: database:default +PREHOOK: Output: default@zorder_props +POSTHOOK: query: DROP TABLE default.zorder_props +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@zorder_props +POSTHOOK: Output: database:default +POSTHOOK: Output: default@zorder_props diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index d253a2f93aa4..1d0e8e9d4847 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -426,6 +426,7 @@ iceberg.llap.query.files=\ iceberg_bucket_map_join_8.q,\ iceberg_clustered.q,\ iceberg_create_locally_ordered_table.q,\ + iceberg_create_locally_zordered_table.q,\ iceberg_merge_delete_files.q,\ iceberg_merge_files.q,\ llap_iceberg_read_orc.q,\ @@ -470,6 +471,7 @@ iceberg.llap.only.query.files=\ iceberg_bucket_map_join_8.q,\ iceberg_clustered.q,\ iceberg_create_locally_ordered_table.q,\ + iceberg_create_locally_zordered_table.q,\ iceberg_merge_delete_files.q,\ iceberg_merge_files.q,\ llap_iceberg_read_orc.q,\ diff --git a/parser/src/java/org/apache/hadoop/hive/ql/parse/CreateDDLParser.g b/parser/src/java/org/apache/hadoop/hive/ql/parse/CreateDDLParser.g index 70b20472ff8e..1c78a50edf9c 100644 --- a/parser/src/java/org/apache/hadoop/hive/ql/parse/CreateDDLParser.g +++ b/parser/src/java/org/apache/hadoop/hive/ql/parse/CreateDDLParser.g @@ -49,6 +49,11 @@ likeTableOrFile | (KW_LIKE likeName=tableName) -> ^(TOK_LIKETABLE $likeName) ; +tableOrderOption + : tableWriteLocallyOrdered + | tableWriteLocallyZorder + ; + //----------------------- Rules for parsing createtable ----------------------------- createTableStatement @init { gParent.pushMsg("create table statement", state); } @@ -64,7 +69,7 @@ createTableStatement tableComment? 
createTablePartitionSpec? tableBuckets? - tableWriteLocallyOrdered? + tableOrderOption? tableSkewed? tableRowFormat? tableFileFormat? @@ -78,7 +83,7 @@ createTableStatement tableComment? createTablePartitionSpec? tableBuckets? - tableWriteLocallyOrdered? + tableOrderOption? tableSkewed? tableRowFormat? tableFileFormat? @@ -96,7 +101,7 @@ createTableStatement tableComment? createTablePartitionSpec? tableBuckets? - tableWriteLocallyOrdered? + tableOrderOption? tableSkewed? tableRowFormat? tableFileFormat? @@ -110,7 +115,7 @@ createTableStatement tableComment? createTablePartitionSpec? tableBuckets? - tableWriteLocallyOrdered? + tableOrderOption? tableSkewed? tableRowFormat? tableFileFormat? diff --git a/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveLexerParent.g b/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveLexerParent.g index de06106b7b30..99ffa6cb48a1 100644 --- a/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveLexerParent.g +++ b/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveLexerParent.g @@ -37,6 +37,7 @@ KW_DESC : 'DESC'; KW_NULLS : 'NULLS'; KW_LAST : 'LAST'; KW_ORDER : 'ORDER'; +KW_ZORDER : 'ZORDER'; KW_ORDERED : 'ORDERED'; KW_LOCALLY : 'LOCALLY'; KW_GROUP : 'GROUP'; diff --git a/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g b/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g index ddce6aa85af6..13d7549128cf 100644 --- a/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g +++ b/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g @@ -524,6 +524,7 @@ TOK_AS_OF_VERSION; TOK_FROM_VERSION; TOK_AS_OF_TAG; TOK_WRITE_LOCALLY_ORDERED; +TOK_WRITE_LOCALLY_ZORDER; } @@ -568,6 +569,7 @@ import org.apache.hadoop.hive.conf.HiveConf; xlateMap.put("KW_NULLS", "NULLS"); xlateMap.put("KW_LAST", "LAST"); xlateMap.put("KW_ORDER", "ORDER"); + xlateMap.put("KW_ZORDER", "ZORDER"); xlateMap.put("KW_ORDERED", "ORDERED"); xlateMap.put("KW_LOCALLY", "LOCALLY"); xlateMap.put("KW_BY", "BY"); @@ -1881,6 +1883,14 @@ tableWriteLocallyOrdered KW_WRITE KW_LOCALLY KW_ORDERED KW_BY sortCols=columnNameOrderList -> ^(TOK_WRITE_LOCALLY_ORDERED $sortCols?) ; + +tableWriteLocallyZorder +@init { pushMsg("table zorder sort specification", state); } +@after { popMsg(state); } + : + KW_WRITE KW_LOCALLY KW_ZORDER KW_BY LPAREN sortCols=columnNameList RPAREN + -> ^(TOK_WRITE_LOCALLY_ZORDER $sortCols?) + ; tableSkewed @init { pushMsg("table skewed specification", state); } diff --git a/parser/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g b/parser/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g index 54ec367a677d..6a22542429ed 100644 --- a/parser/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g +++ b/parser/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g @@ -1040,6 +1040,7 @@ nonReserved | KW_TIMESTAMPLOCALTZ | KW_ORDERED | KW_LOCALLY + | KW_ZORDER ; //The following SQL2011 reserved keywords are used as function name only, but not as identifiers. diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ddl/misc/sortoder/ZOrderFieldDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/ddl/misc/sortoder/ZOrderFieldDesc.java new file mode 100644 index 000000000000..1f2afd4d7ad7 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/ddl/misc/sortoder/ZOrderFieldDesc.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.ddl.misc.sortoder; + +public class ZOrderFieldDesc { + + private String columnName; + + public ZOrderFieldDesc() { + } + + public ZOrderFieldDesc(String columnName) { + this.columnName = columnName; + } + + public String getColumnName() { + return columnName; + } + + public void setColumnName(String columnName) { + this.columnName = columnName; + } + + @Override + public String toString() { + return String.format("{columnName:%s}", columnName); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ddl/misc/sortoder/ZOrderFields.java b/ql/src/java/org/apache/hadoop/hive/ql/ddl/misc/sortoder/ZOrderFields.java new file mode 100644 index 000000000000..023736d9308d --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/ddl/misc/sortoder/ZOrderFields.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hadoop.hive.ql.ddl.misc.sortoder;
+
+import java.util.LinkedList;
+import java.util.List;
+
+public class ZOrderFields {
+
+ private List<ZOrderFieldDesc> zOrderFieldDescs;
+
+ public ZOrderFields() {
+ this.zOrderFieldDescs = new LinkedList<>();
+ }
+
+ public ZOrderFields(List<ZOrderFieldDesc> zOrderFields) {
+ if (zOrderFields == null) {
+ this.zOrderFieldDescs = new LinkedList<>();
+ } else {
+ this.zOrderFieldDescs = zOrderFields;
+ }
+ }
+
+ public List<ZOrderFieldDesc> getZOrderFields() {
+ return zOrderFieldDescs;
+ }
+
+ public void setZOrderFields(List<ZOrderFieldDesc> zOrderFields) {
+ this.zOrderFieldDescs = zOrderFields;
+ }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableAnalyzer.java
index 37858b8af0c8..c6188df71aa5 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableAnalyzer.java
@@ -55,6 +55,8 @@
import org.apache.hadoop.hive.ql.ddl.DDLWork;
import org.apache.hadoop.hive.ql.ddl.misc.sortoder.SortFieldDesc;
import org.apache.hadoop.hive.ql.ddl.misc.sortoder.SortFields;
+import org.apache.hadoop.hive.ql.ddl.misc.sortoder.ZOrderFieldDesc;
+import org.apache.hadoop.hive.ql.ddl.misc.sortoder.ZOrderFields;
import org.apache.hadoop.hive.ql.ddl.table.constraint.ConstraintsUtils;
import org.apache.hadoop.hive.ql.ddl.table.convert.AlterTableConvertOperation;
import org.apache.hadoop.hive.ql.ddl.table.create.like.CreateTableLikeDesc;
@@ -189,6 +191,29 @@
private String getSortOrderJson(ASTNode ast) {
}
}
+ /**
+ * Converts AST child nodes to a JSON string of Z-order fields.
+ * Returns null if JSON serialization fails.
+ *
+ * @param ast AST node containing Z-order field names
+ * @return JSON string of Z-order fields or null on error
+ */
+ private String getZOrderJson(ASTNode ast) {
+ List<ZOrderFieldDesc> zOrderFieldDescs = new ArrayList<>();
+ ZOrderFields zOrderFields = new ZOrderFields(zOrderFieldDescs);
+ for (int i = 0; i < ast.getChildCount(); i++) {
+ ASTNode child = (ASTNode) ast.getChild(i);
+ String name = unescapeIdentifier(child.getText()).toLowerCase();
+ zOrderFieldDescs.add(new ZOrderFieldDesc(name));
+ }
+ try {
+ return JSON_OBJECT_MAPPER.writer().writeValueAsString(zOrderFields);
+ } catch (JsonProcessingException e) {
+ LOG.warn("Can not create z-order json. ", e);
+ return null;
+ }
+ }
+
/**
* This api is used to determine where to create acid tables are not.
* if the default table type is set to external, then create transactional table should result in acid tables,
@@ -495,6 +520,9 @@
ASTNode analyzeCreateTable(ASTNode ast, QB qb, PlannerContext plannerCtx)
case HiveParser.TOK_WRITE_LOCALLY_ORDERED:
sortOrder = getSortOrderJson((ASTNode) child.getChild(0));
break;
+ case HiveParser.TOK_WRITE_LOCALLY_ZORDER:
+ sortOrder = getZOrderJson((ASTNode) child.getChild(0));
+ break;
case HiveParser.TOK_TABLEROWFORMAT:
rowFormatParams.analyzeRowFormat(child);
break;
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
index 7991abc8433a..5f9589ccb106 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
@@ -804,6 +804,8 @@
public final class FunctionRegistry {
(Class<? extends GenericUDF>) Class.forName("org.apache.iceberg.mr.hive.udf.GenericUDFIcebergDay"));
system.registerGenericUDF("iceberg_hour",
(Class<? extends GenericUDF>) Class.forName("org.apache.iceberg.mr.hive.udf.GenericUDFIcebergHour"));
+ system.registerGenericUDF("iceberg_zorder",
+ (Class<? extends GenericUDF>) Class.forName("org.apache.iceberg.mr.hive.udf.GenericUDFIcebergZorder"));
} catch (ClassNotFoundException e) {
LOG.warn("iceberg_bucket function could not be registered");
}
diff --git a/ql/src/test/results/clientpositive/llap/show_functions.q.out b/ql/src/test/results/clientpositive/llap/show_functions.q.out
index cb8955e3808c..d28610a55f5e 100644
--- a/ql/src/test/results/clientpositive/llap/show_functions.q.out
+++ b/ql/src/test/results/clientpositive/llap/show_functions.q.out
@@ -223,6 +223,7 @@
iceberg_hour
iceberg_month
iceberg_truncate
iceberg_year
+iceberg_zorder
if
in
in_bloom_filter
@@ -862,6 +863,7 @@
iceberg_hour
iceberg_month
iceberg_truncate
iceberg_year
+iceberg_zorder
if
in
in_bloom_filter
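For reference, a minimal standalone sketch (not part of the patch) of the z-value construction that GenericUDFIcebergZorder performs, built only on Iceberg's public ZOrderByteUtils API; the class name and sample values are illustrative:

// Illustrative only: mirrors convertToOrderedBytes + interleaveBits from the UDF.
// Assumes iceberg-core on the classpath.
import java.nio.ByteBuffer;
import org.apache.iceberg.util.ZOrderByteUtils;

public class ZOrderSketch {
  public static void main(String[] args) {
    int size = ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE; // 8 bytes per primitive key
    // Each column value becomes a fixed-length, order-preserving byte key.
    byte[] idKey = ZOrderByteUtils.intToOrderedBytes(42, ByteBuffer.allocate(size)).array();
    byte[] tsKey = ZOrderByteUtils.longToOrderedBytes(1700000000000L, ByteBuffer.allocate(size)).array();
    // Interleave the two 8-byte keys bit by bit into one 16-byte z-value,
    // exactly the call evaluate() makes with its per-column byte arrays.
    int outLen = idKey.length + tsKey.length;
    byte[] zValue = ZOrderByteUtils.interleaveBits(
        new byte[][] {idKey, tsKey}, outLen, ByteBuffer.allocate(outLen));
    System.out.println(zValue.length); // 16
  }
}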