From d3810372380c4ecb796405651083d8efc92c4028 Mon Sep 17 00:00:00 2001 From: Costin Leau Date: Tue, 24 Mar 2026 23:37:22 +0200 Subject: [PATCH 1/7] ESQL: Fix ORC type support gaps Fix three ORC type mapping issues in the format reader: - DECIMAL columns (all precisions) crashed with QlIllegalArgumentException because DecimalColumnVector was not handled in the double block path. Add support for both DecimalColumnVector and Decimal64ColumnVector. - TIMESTAMP/TIMESTAMP_INSTANT now map to DATE_NANOS to preserve full nanosecond precision. The sub-millisecond nanos[] array was previously ignored, silently dropping up to 999,999 nanoseconds per timestamp. - BINARY columns now map to UNSUPPORTED instead of KEYWORD to avoid exposing raw bytes as garbled text. --- .../esql/datasource/orc/OrcFormatReader.java | 170 +++++++++++++- .../datasource/orc/OrcFormatReaderTests.java | 216 +++++++++++++++++- 2 files changed, 377 insertions(+), 9 deletions(-) diff --git a/x-pack/plugin/esql-datasource-orc/src/main/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReader.java b/x-pack/plugin/esql-datasource-orc/src/main/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReader.java index 037543a45fd69..a5e0da6809381 100644 --- a/x-pack/plugin/esql-datasource-orc/src/main/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReader.java +++ b/x-pack/plugin/esql-datasource-orc/src/main/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReader.java @@ -11,6 +11,8 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; @@ -74,6 +76,7 @@ public class OrcFormatReader implements FormatReader { private static final long MILLIS_PER_DAY = Duration.ofDays(1).toMillis(); + private static final long NANOS_PER_DAY = Duration.ofDays(1).toNanos(); private final BlockFactory blockFactory; private final SearchArgument pushedFilter; @@ -117,7 +120,7 @@ public SourceMetadata metadata(StorageObject object) throws IOException { * {@code false} uses getMinimum/getMaximum which applies a local-timezone shift. Without * it, predicates against TIMESTAMP_INSTANT columns cause false stripe exclusions. *

- * This is safe because ESQL maps all date types to DATETIME using UTC epoch millis, + * This is safe because ESQL maps timestamp types to DATE_NANOS using UTC epoch nanos, * and the ORC files use TIMESTAMP_INSTANT (UTC-anchored) columns. If we ever support * files with plain TIMESTAMP columns (writer-local timezone), this flag would incorrectly * treat their statistics as UTC too — at that point we'd need per-column evaluation by @@ -325,8 +328,8 @@ private static DataType convertOrcTypeToEsql(TypeDescription orcType) { case FLOAT, DOUBLE -> DataType.DOUBLE; case STRING -> DataType.TEXT; case VARCHAR, CHAR -> DataType.KEYWORD; - case BINARY -> DataType.KEYWORD; - case TIMESTAMP, TIMESTAMP_INSTANT, DATE -> DataType.DATETIME; + case TIMESTAMP, TIMESTAMP_INSTANT -> DataType.DATE_NANOS; + case DATE -> DataType.DATETIME; case DECIMAL -> DataType.DOUBLE; case LIST -> convertOrcTypeToEsql(orcType.getChildren().get(0)); default -> DataType.UNSUPPORTED; @@ -452,6 +455,7 @@ private Block createBlock(ColumnVector vector, DataType dataType, int rowCount) case DOUBLE -> createDoubleBlock(vector, rowCount); case KEYWORD, TEXT -> createBytesRefBlock(vector, rowCount); case DATETIME -> createDatetimeBlock(vector, rowCount); + case DATE_NANOS -> createDateNanosBlock(vector, rowCount); default -> blockFactory.newConstantNullBlock(rowCount); }; } @@ -464,6 +468,7 @@ private Block createListBlock(ListColumnVector listCol, DataType elementType, in case DOUBLE -> createListDoubleBlock(listCol, rowCount); case BOOLEAN -> createListBooleanBlock(listCol, rowCount); case DATETIME -> createListDatetimeBlock(listCol, rowCount); + case DATE_NANOS -> createListDateNanosBlock(listCol, rowCount); default -> blockFactory.newConstantNullBlock(rowCount); }; } @@ -546,7 +551,7 @@ private Block createListLongBlock(ListColumnVector listCol, int rowCount) { } private Block createListDoubleBlock(ListColumnVector listCol, int rowCount) { - DoubleColumnVector child = (DoubleColumnVector) listCol.child; + ColumnVector child = listCol.child; try (var builder = blockFactory.newDoubleBlockBuilder(rowCount)) { for (int i = 0; i < rowCount; i++) { if (listCol.noNulls == false && listCol.isNull[i]) { @@ -560,7 +565,7 @@ private Block createListDoubleBlock(ListColumnVector listCol, int rowCount) { if (child.noNulls == false && child.isNull[idx]) { builder.appendDouble(0.0); } else { - builder.appendDouble(child.vector[idx]); + builder.appendDouble(readDoubleFrom(child, idx)); } } builder.endPositionEntry(); @@ -570,6 +575,17 @@ private Block createListDoubleBlock(ListColumnVector listCol, int rowCount) { } } + private static double readDoubleFrom(ColumnVector vector, int idx) { + if (vector instanceof DoubleColumnVector dv) { + return dv.vector[idx]; + } else if (vector instanceof DecimalColumnVector decV) { + return decV.vector[idx].doubleValue(); + } else if (vector instanceof Decimal64ColumnVector d64) { + return d64.vector[idx] / Math.pow(10, d64.scale); + } + throw new QlIllegalArgumentException("Unsupported list element type: " + vector.getClass().getSimpleName()); + } + private Block createListBooleanBlock(ListColumnVector listCol, int rowCount) { LongColumnVector child = (LongColumnVector) listCol.child; try (var builder = blockFactory.newBooleanBlockBuilder(rowCount)) { @@ -642,6 +658,11 @@ private Block createDoubleBlock(ColumnVector vector, int rowCount) { doubleVector.isRepeating, doubleVector.isNull ); + } else if (vector instanceof DecimalColumnVector decVector) { + return createDecimalDoubleBlock(decVector, rowCount); + } else if (vector instanceof Decimal64ColumnVector dec64Vector) { + // Decimal64ColumnVector extends LongColumnVector — must check before LongColumnVector + return createDecimal64DoubleBlock(dec64Vector, rowCount); } else if (vector instanceof LongColumnVector longVector) { return ColumnBlockConversions.doubleColumnFromLongs( blockFactory, @@ -655,6 +676,54 @@ private Block createDoubleBlock(ColumnVector vector, int rowCount) { throw new QlIllegalArgumentException("Unsupported column type: " + vector.getClass().getSimpleName()); } + /** + * Converts a {@link DecimalColumnVector} (arbitrary precision) to a double block. + * Each element is a {@code HiveDecimalWritable} whose {@code doubleValue()} returns the + * properly scaled value. Precision loss beyond ~15 significant digits is inherent to double. + */ + private Block createDecimalDoubleBlock(DecimalColumnVector decVector, int rowCount) { + if (decVector.isRepeating) { + if (decVector.noNulls == false && decVector.isNull[0]) { + return blockFactory.newConstantNullBlock(rowCount); + } + return blockFactory.newConstantDoubleBlockWith(decVector.vector[0].doubleValue(), rowCount); + } + try (var builder = blockFactory.newDoubleBlockBuilder(rowCount)) { + for (int i = 0; i < rowCount; i++) { + if (decVector.noNulls == false && decVector.isNull[i]) { + builder.appendNull(); + } else { + builder.appendDouble(decVector.vector[i].doubleValue()); + } + } + return builder.build(); + } + } + + /** + * Converts a {@link Decimal64ColumnVector} (precision ≤ 18) to a double block. + * Values are stored as unscaled longs; dividing by 10^scale recovers the decimal value. + */ + private Block createDecimal64DoubleBlock(Decimal64ColumnVector dec64Vector, int rowCount) { + double scaleFactor = Math.pow(10, dec64Vector.scale); + if (dec64Vector.isRepeating) { + if (dec64Vector.noNulls == false && dec64Vector.isNull[0]) { + return blockFactory.newConstantNullBlock(rowCount); + } + return blockFactory.newConstantDoubleBlockWith(dec64Vector.vector[0] / scaleFactor, rowCount); + } + try (var builder = blockFactory.newDoubleBlockBuilder(rowCount)) { + for (int i = 0; i < rowCount; i++) { + if (dec64Vector.noNulls == false && dec64Vector.isNull[i]) { + builder.appendNull(); + } else { + builder.appendDouble(dec64Vector.vector[i] / scaleFactor); + } + } + return builder.build(); + } + } + private Block createBytesRefBlock(ColumnVector vector, int rowCount) { Check.isTrue(vector instanceof BytesColumnVector, "Unsupported column type: " + vector.getClass().getSimpleName()); BytesColumnVector bytesVector = (BytesColumnVector) vector; @@ -728,6 +797,97 @@ private Block createDatetimeBlock(ColumnVector vector, int rowCount) { return blockFactory.newConstantNullBlock(rowCount); } + /** + * Converts a {@link TimestampColumnVector} to a DATE_NANOS block (epoch nanoseconds). + * ORC stores timestamps as ({@code time[]} millis, {@code nanos[]} nanos-within-second). + * We combine them: {@code floorDiv(millis, 1000) * 1_000_000_000 + nanos}. + * {@code floorDiv} is required for correct rounding of pre-epoch timestamps. + */ + private Block createDateNanosBlock(ColumnVector vector, int rowCount) { + if (vector instanceof TimestampColumnVector tsVector) { + if (tsVector.isRepeating) { + if (tsVector.noNulls == false && tsVector.isNull[0]) { + return blockFactory.newConstantNullBlock(rowCount); + } + long epochNanos = Math.floorDiv(tsVector.time[0], 1000L) * 1_000_000_000L + tsVector.nanos[0]; + return blockFactory.newConstantLongBlockWith(epochNanos, rowCount); + } + long[] nanos = new long[rowCount]; + for (int i = 0; i < rowCount; i++) { + nanos[i] = Math.floorDiv(tsVector.time[i], 1000L) * 1_000_000_000L + tsVector.nanos[i]; + } + if (tsVector.noNulls) { + return blockFactory.newLongArrayVector(nanos, rowCount).asBlock(); + } + return blockFactory.newLongArrayBlock( + nanos, + rowCount, + null, + toBitSet(tsVector.isNull, rowCount), + Block.MvOrdering.UNORDERED + ); + } else if (vector instanceof LongColumnVector longVector && vector instanceof Decimal64ColumnVector == false) { + if (longVector.isRepeating) { + if (longVector.noNulls == false && longVector.isNull[0]) { + return blockFactory.newConstantNullBlock(rowCount); + } + return blockFactory.newConstantLongBlockWith(longVector.vector[0] * NANOS_PER_DAY, rowCount); + } + long[] nanos = new long[rowCount]; + for (int i = 0; i < rowCount; i++) { + nanos[i] = longVector.vector[i] * NANOS_PER_DAY; + } + if (longVector.noNulls) { + return blockFactory.newLongArrayVector(nanos, rowCount).asBlock(); + } + return blockFactory.newLongArrayBlock( + nanos, + rowCount, + null, + toBitSet(longVector.isNull, rowCount), + Block.MvOrdering.UNORDERED + ); + } + return blockFactory.newConstantNullBlock(rowCount); + } + + private Block createListDateNanosBlock(ListColumnVector listCol, int rowCount) { + ColumnVector child = listCol.child; + try (var builder = blockFactory.newLongBlockBuilder(rowCount)) { + for (int i = 0; i < rowCount; i++) { + if (listCol.noNulls == false && listCol.isNull[i]) { + builder.appendNull(); + } else { + int start = (int) listCol.offsets[i]; + int len = (int) listCol.lengths[i]; + builder.beginPositionEntry(); + for (int j = 0; j < len; j++) { + int idx = start + j; + long nanos; + if (child instanceof TimestampColumnVector ts) { + if (ts.noNulls == false && ts.isNull[idx]) { + nanos = 0L; + } else { + nanos = Math.floorDiv(ts.time[idx], 1000L) * 1_000_000_000L + ts.nanos[idx]; + } + } else if (child instanceof LongColumnVector lv) { + if (lv.noNulls == false && lv.isNull[idx]) { + nanos = 0L; + } else { + nanos = lv.vector[idx] * NANOS_PER_DAY; + } + } else { + nanos = 0L; + } + builder.appendLong(nanos); + } + builder.endPositionEntry(); + } + } + return builder.build(); + } + } + private static BitSet toBitSet(boolean[] isNull, int length) { BitSet bits = new BitSet(length); for (int i = 0; i < length; i++) { diff --git a/x-pack/plugin/esql-datasource-orc/src/test/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReaderTests.java b/x-pack/plugin/esql-datasource-orc/src/test/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReaderTests.java index b84ca6176ef53..75a6e353364d9 100644 --- a/x-pack/plugin/esql-datasource-orc/src/test/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReaderTests.java +++ b/x-pack/plugin/esql-datasource-orc/src/test/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReaderTests.java @@ -12,6 +12,7 @@ import org.apache.hadoop.fs.RawLocalFileSystem; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; @@ -20,6 +21,7 @@ import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.lucene.util.BytesRef; import org.apache.orc.CompressionKind; import org.apache.orc.OrcFile; @@ -490,6 +492,9 @@ public void testReadTimestampColumn() throws Exception { StorageObject storageObject = createStorageObject(orcData); OrcFormatReader reader = new OrcFormatReader(blockFactory); + SourceMetadata metadata = reader.metadata(storageObject); + assertEquals(DataType.DATE_NANOS, metadata.schema().get(1).dataType()); + try (CloseableIterator iterator = reader.read(storageObject, null, 1024)) { assertTrue(iterator.hasNext()); Page page = iterator.next(); @@ -497,8 +502,8 @@ public void testReadTimestampColumn() throws Exception { assertEquals(2, page.getPositionCount()); LongBlock tsBlock = (LongBlock) page.getBlock(1); - assertEquals(epochMillis, tsBlock.getLong(0)); - assertEquals(epochMillis + 3600_000, tsBlock.getLong(1)); + assertEquals(epochMillis * 1_000_000L, tsBlock.getLong(0)); + assertEquals((epochMillis + 3600_000) * 1_000_000L, tsBlock.getLong(1)); } } @@ -537,9 +542,9 @@ public void testReadTimestampColumnWithNulls() throws Exception { assertEquals(3, page.getPositionCount()); LongBlock tsBlock = (LongBlock) page.getBlock(1); - assertEquals(epochMillis, tsBlock.getLong(0)); + assertEquals(epochMillis * 1_000_000L, tsBlock.getLong(0)); assertTrue(tsBlock.isNull(1)); - assertEquals(epochMillis + 7200_000, tsBlock.getLong(2)); + assertEquals((epochMillis + 7200_000) * 1_000_000L, tsBlock.getLong(2)); } } @@ -760,6 +765,209 @@ public void testReadWithPushedFilterAndColumnProjection() throws Exception { } } + public void testTimestampWithSubMillisPrecision() throws Exception { + TypeDescription schema = TypeDescription.createStruct().addField("event_time", TypeDescription.createTimestampInstant()); + + long epochMillis = Instant.parse("2024-01-15T10:30:00.123Z").toEpochMilli(); + + byte[] orcData = createOrcFile(schema, batch -> { + batch.size = 2; + TimestampColumnVector tsCol = (TimestampColumnVector) batch.cols[0]; + + tsCol.time[0] = epochMillis; + tsCol.nanos[0] = 123_456_789; + + tsCol.time[1] = epochMillis; + tsCol.nanos[1] = 123_000_000; + }); + + StorageObject storageObject = createStorageObject(orcData); + OrcFormatReader reader = new OrcFormatReader(blockFactory); + + try (CloseableIterator iterator = reader.read(storageObject, null, 1024)) { + assertTrue(iterator.hasNext()); + Page page = iterator.next(); + + assertEquals(2, page.getPositionCount()); + LongBlock tsBlock = (LongBlock) page.getBlock(0); + + long expectedSeconds = epochMillis / 1000; + assertEquals(expectedSeconds * 1_000_000_000L + 123_456_789L, tsBlock.getLong(0)); + assertEquals(expectedSeconds * 1_000_000_000L + 123_000_000L, tsBlock.getLong(1)); + } + } + + public void testBinaryMapsToUnsupported() throws Exception { + TypeDescription schema = TypeDescription.createStruct() + .addField("id", TypeDescription.createLong()) + .addField("payload", TypeDescription.createBinary()); + + byte[] rawBytes = new byte[] { 0x00, 0x01, (byte) 0xFF, (byte) 0xFE, 0x42 }; + byte[] orcData = createOrcFile(schema, batch -> { + batch.size = 1; + ((LongColumnVector) batch.cols[0]).vector[0] = 1L; + ((BytesColumnVector) batch.cols[1]).setVal(0, rawBytes); + }); + + StorageObject storageObject = createStorageObject(orcData); + OrcFormatReader reader = new OrcFormatReader(blockFactory); + + SourceMetadata metadata = reader.metadata(storageObject); + assertEquals(DataType.UNSUPPORTED, metadata.schema().get(1).dataType()); + + try (CloseableIterator iterator = reader.read(storageObject, null, 1024)) { + assertTrue(iterator.hasNext()); + Page page = iterator.next(); + + assertEquals(1, page.getPositionCount()); + assertEquals(1L, ((LongBlock) page.getBlock(0)).getLong(0)); + assertTrue(page.getBlock(1).isNull(0)); + } + } + + public void testReadDecimalColumn() throws Exception { + TypeDescription schema = TypeDescription.createStruct() + .addField("id", TypeDescription.createLong()) + .addField("price", TypeDescription.createDecimal().withPrecision(10).withScale(2)); + + byte[] orcData = createOrcFile(schema, batch -> { + batch.size = 3; + LongColumnVector idCol = (LongColumnVector) batch.cols[0]; + DecimalColumnVector priceCol = (DecimalColumnVector) batch.cols[1]; + + idCol.vector[0] = 1L; + priceCol.set(0, new HiveDecimalWritable("123.45")); + + idCol.vector[1] = 2L; + priceCol.set(1, new HiveDecimalWritable("0.01")); + + idCol.vector[2] = 3L; + priceCol.set(2, new HiveDecimalWritable("99999.99")); + }); + + StorageObject storageObject = createStorageObject(orcData); + OrcFormatReader reader = new OrcFormatReader(blockFactory); + + SourceMetadata metadata = reader.metadata(storageObject); + assertEquals(DataType.DOUBLE, metadata.schema().get(1).dataType()); + + try (CloseableIterator iterator = reader.read(storageObject, null, 1024)) { + assertTrue(iterator.hasNext()); + Page page = iterator.next(); + + assertEquals(3, page.getPositionCount()); + + DoubleBlock priceBlock = (DoubleBlock) page.getBlock(1); + assertEquals(123.45, priceBlock.getDouble(0), 0.001); + assertEquals(0.01, priceBlock.getDouble(1), 0.001); + assertEquals(99999.99, priceBlock.getDouble(2), 0.001); + } + } + + public void testReadDecimalColumnWithNulls() throws Exception { + TypeDescription schema = TypeDescription.createStruct() + .addField("amount", TypeDescription.createDecimal().withPrecision(10).withScale(2)); + + byte[] orcData = createOrcFile(schema, batch -> { + batch.size = 3; + DecimalColumnVector amountCol = (DecimalColumnVector) batch.cols[0]; + + amountCol.set(0, new HiveDecimalWritable("42.50")); + + amountCol.noNulls = false; + amountCol.isNull[1] = true; + + amountCol.set(2, new HiveDecimalWritable("100.00")); + }); + + StorageObject storageObject = createStorageObject(orcData); + OrcFormatReader reader = new OrcFormatReader(blockFactory); + + try (CloseableIterator iterator = reader.read(storageObject, null, 1024)) { + assertTrue(iterator.hasNext()); + Page page = iterator.next(); + + assertEquals(3, page.getPositionCount()); + + DoubleBlock block = (DoubleBlock) page.getBlock(0); + assertEquals(42.50, block.getDouble(0), 0.001); + assertTrue(block.isNull(1)); + assertEquals(100.00, block.getDouble(2), 0.001); + } + } + + public void testReadDecimalHighPrecision() throws Exception { + TypeDescription schema = TypeDescription.createStruct() + .addField("value", TypeDescription.createDecimal().withPrecision(38).withScale(10)); + + byte[] orcData = createOrcFile(schema, batch -> { + batch.size = 2; + DecimalColumnVector valCol = (DecimalColumnVector) batch.cols[0]; + + valCol.set(0, new HiveDecimalWritable("1234567890.1234567890")); + valCol.set(1, new HiveDecimalWritable("-9876543210.0000000001")); + }); + + StorageObject storageObject = createStorageObject(orcData); + OrcFormatReader reader = new OrcFormatReader(blockFactory); + + try (CloseableIterator iterator = reader.read(storageObject, null, 1024)) { + assertTrue(iterator.hasNext()); + Page page = iterator.next(); + + assertEquals(2, page.getPositionCount()); + + DoubleBlock block = (DoubleBlock) page.getBlock(0); + assertEquals(1234567890.1234567890, block.getDouble(0), 0.01); + assertEquals(-9876543210.0000000001, block.getDouble(1), 0.01); + } + } + + public void testReadListDecimalColumn() throws Exception { + TypeDescription schema = TypeDescription.createStruct() + .addField("id", TypeDescription.createLong()) + .addField("prices", TypeDescription.createList(TypeDescription.createDecimal().withPrecision(10).withScale(2))); + + byte[] orcData = createOrcFile(schema, batch -> { + batch.size = 2; + LongColumnVector idCol = (LongColumnVector) batch.cols[0]; + ListColumnVector pricesCol = (ListColumnVector) batch.cols[1]; + DecimalColumnVector pricesChild = (DecimalColumnVector) pricesCol.child; + + pricesChild.ensureSize(3, false); + idCol.vector[0] = 1L; + pricesCol.offsets[0] = 0; + pricesCol.lengths[0] = 2; + pricesChild.set(0, new HiveDecimalWritable("10.50")); + pricesChild.set(1, new HiveDecimalWritable("20.99")); + + idCol.vector[1] = 2L; + pricesCol.offsets[1] = 2; + pricesCol.lengths[1] = 1; + pricesChild.set(2, new HiveDecimalWritable("99.00")); + }); + + StorageObject storageObject = createStorageObject(orcData); + OrcFormatReader reader = new OrcFormatReader(blockFactory); + + SourceMetadata metadata = reader.metadata(storageObject); + assertEquals(DataType.DOUBLE, metadata.schema().get(1).dataType()); + + try (CloseableIterator iterator = reader.read(storageObject, null, 1024)) { + assertTrue(iterator.hasNext()); + Page page = iterator.next(); + + assertEquals(2, page.getPositionCount()); + + DoubleBlock pricesBlock = (DoubleBlock) page.getBlock(1); + assertEquals(2, pricesBlock.getValueCount(0)); + assertEquals(10.50, pricesBlock.getDouble(0), 0.001); + assertEquals(20.99, pricesBlock.getDouble(1), 0.001); + assertEquals(1, pricesBlock.getValueCount(1)); + assertEquals(99.00, pricesBlock.getDouble(2), 0.001); + } + } + @FunctionalInterface private interface BatchPopulator { void populate(VectorizedRowBatch batch); From 846c0fcc5eb9466a1ac1b3a4478b60860b1fec4f Mon Sep 17 00:00:00 2001 From: Costin Leau Date: Fri, 27 Mar 2026 11:29:40 +0200 Subject: [PATCH 2/7] Update docs/changelog/145074.yaml --- docs/changelog/145074.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/145074.yaml diff --git a/docs/changelog/145074.yaml b/docs/changelog/145074.yaml new file mode 100644 index 0000000000000..ac4a22324276e --- /dev/null +++ b/docs/changelog/145074.yaml @@ -0,0 +1,5 @@ +area: ES|QL +issues: [] +pr: 145074 +summary: Fix ORC type support gaps +type: enhancement From 52e57a7aeab3b20c10bda24570c6a0af65a6d99b Mon Sep 17 00:00:00 2001 From: Costin Leau Date: Fri, 27 Mar 2026 12:47:30 +0200 Subject: [PATCH 3/7] Fix csv-spec type assertion for DATE_NANOS columns Accept DATE_NANOS as compatible with DATETIME in test type assertions, matching the existing INTEGER/LONG coercion pattern. External source formats like ORC that preserve nanosecond precision map timestamps to DATE_NANOS, but the shared csv-spec files declare columns as :date (DATETIME). --- .../src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java index 168ddb655a469..12ed3f8f673f8 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java @@ -126,6 +126,9 @@ public static void assertMetadata( if (actualType == Type.INTEGER && expectedType == Type.LONG) { actualType = Type.LONG; } + if (actualType == Type.DATE_NANOS && expectedType == Type.DATETIME) { + actualType = Type.DATETIME; + } if (actualType == null) { actualType = Type.NULL; } From 1d8568f0c8b15999ca435190ea969e41a162e35e Mon Sep 17 00:00:00 2001 From: Costin Leau Date: Fri, 27 Mar 2026 17:52:17 +0200 Subject: [PATCH 4/7] Address review feedback - Add explanatory comment for DATE_NANOS/DATETIME coercion in CsvAssert, clarifying the rationale. - Replace silent else catch-alls with QlIllegalArgumentException in createListDateNanosBlock and createListDatetimeBlock, consistent with createDoubleBlock. --- .../xpack/esql/datasource/orc/OrcFormatReader.java | 8 ++++++-- .../main/java/org/elasticsearch/xpack/esql/CsvAssert.java | 2 ++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/esql-datasource-orc/src/main/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReader.java b/x-pack/plugin/esql-datasource-orc/src/main/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReader.java index e7196a20f932b..43f7513a604a2 100644 --- a/x-pack/plugin/esql-datasource-orc/src/main/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReader.java +++ b/x-pack/plugin/esql-datasource-orc/src/main/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReader.java @@ -643,7 +643,9 @@ private Block createListDatetimeBlock(ListColumnVector listCol, int rowCount) { millis = lv.vector[idx] * MILLIS_PER_DAY; } } else { - millis = 0L; + throw new QlIllegalArgumentException( + "Unsupported list child type for DATETIME: " + child.getClass().getSimpleName() + ); } builder.appendLong(millis); } @@ -883,7 +885,9 @@ private Block createListDateNanosBlock(ListColumnVector listCol, int rowCount) { nanos = lv.vector[idx] * NANOS_PER_DAY; } } else { - nanos = 0L; + throw new QlIllegalArgumentException( + "Unsupported list child type for DATE_NANOS: " + child.getClass().getSimpleName() + ); } builder.appendLong(nanos); } diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java index 12ed3f8f673f8..01f2789093f08 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java @@ -126,6 +126,8 @@ public static void assertMetadata( if (actualType == Type.INTEGER && expectedType == Type.LONG) { actualType = Type.LONG; } + // DATE_NANOS is a higher-precision form of DATETIME; external source formats (e.g. ORC) that preserve + // nanosecond precision return DATE_NANOS, but the shared csv-spec files declare columns as :date (DATETIME). if (actualType == Type.DATE_NANOS && expectedType == Type.DATETIME) { actualType = Type.DATETIME; } From c188932e4b2fc0d68f72cb67bcb0990a194808cc Mon Sep 17 00:00:00 2001 From: Costin Leau Date: Mon, 30 Mar 2026 16:56:27 +0300 Subject: [PATCH 5/7] Revert TIMESTAMP to DATETIME, remove DATE_NANOS path DATE_NANOS cannot represent pre-epoch dates (assert nanos >= 0 in DateUtils.compareNanosToMillis), which crashes on ORC files with historical timestamps. Revert to DATETIME (millis precision) until DATE_NANOS range is extended. --- .../esql/datasource/orc/OrcFormatReader.java | 108 +----------------- .../datasource/orc/OrcFormatReaderTests.java | 17 ++- .../elasticsearch/xpack/esql/CsvAssert.java | 5 - 3 files changed, 14 insertions(+), 116 deletions(-) diff --git a/x-pack/plugin/esql-datasource-orc/src/main/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReader.java b/x-pack/plugin/esql-datasource-orc/src/main/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReader.java index 43f7513a604a2..b3e62eec44b53 100644 --- a/x-pack/plugin/esql-datasource-orc/src/main/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReader.java +++ b/x-pack/plugin/esql-datasource-orc/src/main/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReader.java @@ -77,7 +77,6 @@ public class OrcFormatReader implements FormatReader { private static final long MILLIS_PER_DAY = Duration.ofDays(1).toMillis(); - private static final long NANOS_PER_DAY = Duration.ofDays(1).toNanos(); private final BlockFactory blockFactory; private final SearchArgument pushedFilter; @@ -121,11 +120,11 @@ public SourceMetadata metadata(StorageObject object) throws IOException { * {@code false} uses getMinimum/getMaximum which applies a local-timezone shift. Without * it, predicates against TIMESTAMP_INSTANT columns cause false stripe exclusions. *

- * This is safe because ESQL maps timestamp types to DATE_NANOS using UTC epoch nanos, - * and the ORC files use TIMESTAMP_INSTANT (UTC-anchored) columns. If we ever support - * files with plain TIMESTAMP columns (writer-local timezone), this flag would incorrectly - * treat their statistics as UTC too — at that point we'd need per-column evaluation by - * bypassing SearchArgument and reading stripe statistics directly (the Trino approach). + * This is safe because the ORC files use TIMESTAMP_INSTANT (UTC-anchored) columns. If we + * ever support files with plain TIMESTAMP columns (writer-local timezone), this flag would + * incorrectly treat their statistics as UTC too — at that point we'd need per-column + * evaluation by bypassing SearchArgument and reading stripe statistics directly (the Trino + * approach). */ private static OrcFile.ReaderOptions orcReaderOptions(OrcStorageObjectAdapter fs) { return OrcFile.readerOptions(new Configuration(false)).filesystem(fs).useUTCTimestamp(true); @@ -334,7 +333,7 @@ private static DataType convertOrcTypeToEsql(TypeDescription orcType) { case FLOAT, DOUBLE -> DataType.DOUBLE; case STRING -> DataType.TEXT; case VARCHAR, CHAR -> DataType.KEYWORD; - case TIMESTAMP, TIMESTAMP_INSTANT -> DataType.DATE_NANOS; + case TIMESTAMP, TIMESTAMP_INSTANT -> DataType.DATETIME; case DATE -> DataType.DATETIME; case DECIMAL -> DataType.DOUBLE; case LIST -> convertOrcTypeToEsql(orcType.getChildren().get(0)); @@ -461,7 +460,6 @@ private Block createBlock(ColumnVector vector, DataType dataType, int rowCount) case DOUBLE -> createDoubleBlock(vector, rowCount); case KEYWORD, TEXT -> createBytesRefBlock(vector, rowCount); case DATETIME -> createDatetimeBlock(vector, rowCount); - case DATE_NANOS -> createDateNanosBlock(vector, rowCount); default -> blockFactory.newConstantNullBlock(rowCount); }; } @@ -474,7 +472,6 @@ private Block createListBlock(ListColumnVector listCol, DataType elementType, in case DOUBLE -> createListDoubleBlock(listCol, rowCount); case BOOLEAN -> createListBooleanBlock(listCol, rowCount); case DATETIME -> createListDatetimeBlock(listCol, rowCount); - case DATE_NANOS -> createListDateNanosBlock(listCol, rowCount); default -> blockFactory.newConstantNullBlock(rowCount); }; } @@ -805,99 +802,6 @@ private Block createDatetimeBlock(ColumnVector vector, int rowCount) { return blockFactory.newConstantNullBlock(rowCount); } - /** - * Converts a {@link TimestampColumnVector} to a DATE_NANOS block (epoch nanoseconds). - * ORC stores timestamps as ({@code time[]} millis, {@code nanos[]} nanos-within-second). - * We combine them: {@code floorDiv(millis, 1000) * 1_000_000_000 + nanos}. - * {@code floorDiv} is required for correct rounding of pre-epoch timestamps. - */ - private Block createDateNanosBlock(ColumnVector vector, int rowCount) { - if (vector instanceof TimestampColumnVector tsVector) { - if (tsVector.isRepeating) { - if (tsVector.noNulls == false && tsVector.isNull[0]) { - return blockFactory.newConstantNullBlock(rowCount); - } - long epochNanos = Math.floorDiv(tsVector.time[0], 1000L) * 1_000_000_000L + tsVector.nanos[0]; - return blockFactory.newConstantLongBlockWith(epochNanos, rowCount); - } - long[] nanos = new long[rowCount]; - for (int i = 0; i < rowCount; i++) { - nanos[i] = Math.floorDiv(tsVector.time[i], 1000L) * 1_000_000_000L + tsVector.nanos[i]; - } - if (tsVector.noNulls) { - return blockFactory.newLongArrayVector(nanos, rowCount).asBlock(); - } - return blockFactory.newLongArrayBlock( - nanos, - rowCount, - null, - toBitSet(tsVector.isNull, rowCount), - Block.MvOrdering.UNORDERED - ); - } else if (vector instanceof LongColumnVector longVector && vector instanceof Decimal64ColumnVector == false) { - if (longVector.isRepeating) { - if (longVector.noNulls == false && longVector.isNull[0]) { - return blockFactory.newConstantNullBlock(rowCount); - } - return blockFactory.newConstantLongBlockWith(longVector.vector[0] * NANOS_PER_DAY, rowCount); - } - long[] nanos = new long[rowCount]; - for (int i = 0; i < rowCount; i++) { - nanos[i] = longVector.vector[i] * NANOS_PER_DAY; - } - if (longVector.noNulls) { - return blockFactory.newLongArrayVector(nanos, rowCount).asBlock(); - } - return blockFactory.newLongArrayBlock( - nanos, - rowCount, - null, - toBitSet(longVector.isNull, rowCount), - Block.MvOrdering.UNORDERED - ); - } - return blockFactory.newConstantNullBlock(rowCount); - } - - private Block createListDateNanosBlock(ListColumnVector listCol, int rowCount) { - ColumnVector child = listCol.child; - try (var builder = blockFactory.newLongBlockBuilder(rowCount)) { - for (int i = 0; i < rowCount; i++) { - if (listCol.noNulls == false && listCol.isNull[i]) { - builder.appendNull(); - } else { - int start = (int) listCol.offsets[i]; - int len = (int) listCol.lengths[i]; - builder.beginPositionEntry(); - for (int j = 0; j < len; j++) { - int idx = start + j; - long nanos; - if (child instanceof TimestampColumnVector ts) { - if (ts.noNulls == false && ts.isNull[idx]) { - nanos = 0L; - } else { - nanos = Math.floorDiv(ts.time[idx], 1000L) * 1_000_000_000L + ts.nanos[idx]; - } - } else if (child instanceof LongColumnVector lv) { - if (lv.noNulls == false && lv.isNull[idx]) { - nanos = 0L; - } else { - nanos = lv.vector[idx] * NANOS_PER_DAY; - } - } else { - throw new QlIllegalArgumentException( - "Unsupported list child type for DATE_NANOS: " + child.getClass().getSimpleName() - ); - } - builder.appendLong(nanos); - } - builder.endPositionEntry(); - } - } - return builder.build(); - } - } - private static BitSet toBitSet(boolean[] isNull, int length) { BitSet bits = new BitSet(length); for (int i = 0; i < length; i++) { diff --git a/x-pack/plugin/esql-datasource-orc/src/test/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReaderTests.java b/x-pack/plugin/esql-datasource-orc/src/test/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReaderTests.java index 75a6e353364d9..c735b651cf2d0 100644 --- a/x-pack/plugin/esql-datasource-orc/src/test/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReaderTests.java +++ b/x-pack/plugin/esql-datasource-orc/src/test/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReaderTests.java @@ -493,7 +493,7 @@ public void testReadTimestampColumn() throws Exception { OrcFormatReader reader = new OrcFormatReader(blockFactory); SourceMetadata metadata = reader.metadata(storageObject); - assertEquals(DataType.DATE_NANOS, metadata.schema().get(1).dataType()); + assertEquals(DataType.DATETIME, metadata.schema().get(1).dataType()); try (CloseableIterator iterator = reader.read(storageObject, null, 1024)) { assertTrue(iterator.hasNext()); @@ -502,8 +502,8 @@ public void testReadTimestampColumn() throws Exception { assertEquals(2, page.getPositionCount()); LongBlock tsBlock = (LongBlock) page.getBlock(1); - assertEquals(epochMillis * 1_000_000L, tsBlock.getLong(0)); - assertEquals((epochMillis + 3600_000) * 1_000_000L, tsBlock.getLong(1)); + assertEquals(epochMillis, tsBlock.getLong(0)); + assertEquals(epochMillis + 3600_000, tsBlock.getLong(1)); } } @@ -542,9 +542,9 @@ public void testReadTimestampColumnWithNulls() throws Exception { assertEquals(3, page.getPositionCount()); LongBlock tsBlock = (LongBlock) page.getBlock(1); - assertEquals(epochMillis * 1_000_000L, tsBlock.getLong(0)); + assertEquals(epochMillis, tsBlock.getLong(0)); assertTrue(tsBlock.isNull(1)); - assertEquals((epochMillis + 7200_000) * 1_000_000L, tsBlock.getLong(2)); + assertEquals(epochMillis + 7200_000, tsBlock.getLong(2)); } } @@ -765,7 +765,7 @@ public void testReadWithPushedFilterAndColumnProjection() throws Exception { } } - public void testTimestampWithSubMillisPrecision() throws Exception { + public void testTimestampTruncatesToMillisPrecision() throws Exception { TypeDescription schema = TypeDescription.createStruct().addField("event_time", TypeDescription.createTimestampInstant()); long epochMillis = Instant.parse("2024-01-15T10:30:00.123Z").toEpochMilli(); @@ -791,9 +791,8 @@ public void testTimestampWithSubMillisPrecision() throws Exception { assertEquals(2, page.getPositionCount()); LongBlock tsBlock = (LongBlock) page.getBlock(0); - long expectedSeconds = epochMillis / 1000; - assertEquals(expectedSeconds * 1_000_000_000L + 123_456_789L, tsBlock.getLong(0)); - assertEquals(expectedSeconds * 1_000_000_000L + 123_000_000L, tsBlock.getLong(1)); + assertEquals(epochMillis, tsBlock.getLong(0)); + assertEquals(epochMillis, tsBlock.getLong(1)); } } diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java index 01f2789093f08..168ddb655a469 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java @@ -126,11 +126,6 @@ public static void assertMetadata( if (actualType == Type.INTEGER && expectedType == Type.LONG) { actualType = Type.LONG; } - // DATE_NANOS is a higher-precision form of DATETIME; external source formats (e.g. ORC) that preserve - // nanosecond precision return DATE_NANOS, but the shared csv-spec files declare columns as :date (DATETIME). - if (actualType == Type.DATE_NANOS && expectedType == Type.DATETIME) { - actualType = Type.DATETIME; - } if (actualType == null) { actualType = Type.NULL; } From daee10e1a53aff0ae2d4443105feefef3eaab420 Mon Sep 17 00:00:00 2001 From: Costin Leau Date: Mon, 30 Mar 2026 23:30:07 +0300 Subject: [PATCH 6/7] Add pre-epoch date filter regression test Adds filterByPreEpochBirthDate to the shared csv-spec that filters on a 1953 birth date, protecting against re-introducing DATE_NANOS without fixing its range. --- .../src/main/resources/external-basic.csv-spec | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/external-basic.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/external-basic.csv-spec index 1c749392f645e..612e95f705a69 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/external-basic.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/external-basic.csv-spec @@ -17,6 +17,19 @@ emp_no:integer | first_name:keyword | last_name:keyword | birth_date:date 10005 | "Kyoichi" | "Maliniak" | 1955-01-21T00:00:00.000Z | "M" | 1989-09-12T00:00:00.000Z | 1 | 2.05 | 63528 | true ; +filterByPreEpochBirthDate +required_capability: external_command + +EXTERNAL "{{employees}}" +| KEEP emp_no, first_name, last_name, birth_date, gender, hire_date, languages, height, salary, still_hired +| WHERE birth_date == TO_DATETIME("1953-09-02T00:00:00.000Z") +| SORT emp_no +| LIMIT 5; + +emp_no:integer | first_name:keyword | last_name:keyword | birth_date:date | gender:keyword | hire_date:date | languages:integer | height:double | salary:integer | still_hired:boolean +10001 | "Georgi" | "Facello" | 1953-09-02T00:00:00.000Z | "M" | 1986-06-26T00:00:00.000Z | 2 | 2.03 | 57305 | true +; + selectSpecificColumns required_capability: external_command From b3bd0a755ea721f886ac8d126853827ea1422950 Mon Sep 17 00:00:00 2001 From: Costin Leau Date: Mon, 30 Mar 2026 23:44:51 +0300 Subject: [PATCH 7/7] Add pre-epoch and list timestamp unit tests testPreEpochTimestamp: verifies 1953 date reads as negative millis, protecting the DATETIME path against DATE_NANOS regression at the reader level. testReadListTimestampColumn: exercises createListDatetimeBlock which had zero test coverage; includes a pre-epoch element. --- .../esql/datasource/orc/OrcFormatReader.java | 7 +- .../datasource/orc/OrcFormatReaderTests.java | 94 +++++++++++++++++++ 2 files changed, 98 insertions(+), 3 deletions(-) diff --git a/x-pack/plugin/esql-datasource-orc/src/main/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReader.java b/x-pack/plugin/esql-datasource-orc/src/main/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReader.java index b3e62eec44b53..1691e1eb3ccb6 100644 --- a/x-pack/plugin/esql-datasource-orc/src/main/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReader.java +++ b/x-pack/plugin/esql-datasource-orc/src/main/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReader.java @@ -555,6 +555,7 @@ private Block createListLongBlock(ListColumnVector listCol, int rowCount) { private Block createListDoubleBlock(ListColumnVector listCol, int rowCount) { ColumnVector child = listCol.child; + double d64ScaleFactor = child instanceof Decimal64ColumnVector d64 ? Math.pow(10, d64.scale) : 0; try (var builder = blockFactory.newDoubleBlockBuilder(rowCount)) { for (int i = 0; i < rowCount; i++) { if (listCol.noNulls == false && listCol.isNull[i]) { @@ -568,7 +569,7 @@ private Block createListDoubleBlock(ListColumnVector listCol, int rowCount) { if (child.noNulls == false && child.isNull[idx]) { builder.appendDouble(0.0); } else { - builder.appendDouble(readDoubleFrom(child, idx)); + builder.appendDouble(readDoubleFrom(child, idx, d64ScaleFactor)); } } builder.endPositionEntry(); @@ -578,13 +579,13 @@ private Block createListDoubleBlock(ListColumnVector listCol, int rowCount) { } } - private static double readDoubleFrom(ColumnVector vector, int idx) { + private static double readDoubleFrom(ColumnVector vector, int idx, double d64ScaleFactor) { if (vector instanceof DoubleColumnVector dv) { return dv.vector[idx]; } else if (vector instanceof DecimalColumnVector decV) { return decV.vector[idx].doubleValue(); } else if (vector instanceof Decimal64ColumnVector d64) { - return d64.vector[idx] / Math.pow(10, d64.scale); + return d64.vector[idx] / d64ScaleFactor; } throw new QlIllegalArgumentException("Unsupported list element type: " + vector.getClass().getSimpleName()); } diff --git a/x-pack/plugin/esql-datasource-orc/src/test/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReaderTests.java b/x-pack/plugin/esql-datasource-orc/src/test/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReaderTests.java index c735b651cf2d0..a4e7747c439c2 100644 --- a/x-pack/plugin/esql-datasource-orc/src/test/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReaderTests.java +++ b/x-pack/plugin/esql-datasource-orc/src/test/java/org/elasticsearch/xpack/esql/datasource/orc/OrcFormatReaderTests.java @@ -796,6 +796,100 @@ public void testTimestampTruncatesToMillisPrecision() throws Exception { } } + public void testPreEpochTimestamp() throws Exception { + TypeDescription schema = TypeDescription.createStruct() + .addField("id", TypeDescription.createLong()) + .addField("birth_date", TypeDescription.createTimestampInstant()); + + long preEpochMillis = Instant.parse("1953-09-02T00:00:00Z").toEpochMilli(); + long postEpochMillis = Instant.parse("1986-06-26T00:00:00Z").toEpochMilli(); + + byte[] orcData = createOrcFile(schema, batch -> { + batch.size = 2; + LongColumnVector idCol = (LongColumnVector) batch.cols[0]; + TimestampColumnVector tsCol = (TimestampColumnVector) batch.cols[1]; + + idCol.vector[0] = 1L; + tsCol.time[0] = preEpochMillis; + tsCol.nanos[0] = 0; + + idCol.vector[1] = 2L; + tsCol.time[1] = postEpochMillis; + tsCol.nanos[1] = 0; + }); + + StorageObject storageObject = createStorageObject(orcData); + OrcFormatReader reader = new OrcFormatReader(blockFactory); + + SourceMetadata metadata = reader.metadata(storageObject); + assertEquals(DataType.DATETIME, metadata.schema().get(1).dataType()); + + try (CloseableIterator iterator = reader.read(storageObject, null, 1024)) { + assertTrue(iterator.hasNext()); + Page page = iterator.next(); + + assertEquals(2, page.getPositionCount()); + + LongBlock tsBlock = (LongBlock) page.getBlock(1); + assertTrue("pre-epoch millis should be negative", tsBlock.getLong(0) < 0); + assertEquals(preEpochMillis, tsBlock.getLong(0)); + assertEquals(postEpochMillis, tsBlock.getLong(1)); + } + } + + public void testReadListTimestampColumn() throws Exception { + TypeDescription schema = TypeDescription.createStruct() + .addField("id", TypeDescription.createLong()) + .addField("events", TypeDescription.createList(TypeDescription.createTimestampInstant())); + + long ts1 = Instant.parse("2024-01-15T10:00:00Z").toEpochMilli(); + long ts2 = Instant.parse("2024-01-15T11:00:00Z").toEpochMilli(); + long ts3 = Instant.parse("1965-03-20T08:30:00Z").toEpochMilli(); + + byte[] orcData = createOrcFile(schema, batch -> { + batch.size = 2; + LongColumnVector idCol = (LongColumnVector) batch.cols[0]; + ListColumnVector eventsCol = (ListColumnVector) batch.cols[1]; + TimestampColumnVector eventsChild = (TimestampColumnVector) eventsCol.child; + + eventsChild.ensureSize(3, false); + idCol.vector[0] = 1L; + eventsCol.offsets[0] = 0; + eventsCol.lengths[0] = 2; + eventsChild.time[0] = ts1; + eventsChild.nanos[0] = 0; + eventsChild.time[1] = ts2; + eventsChild.nanos[1] = 0; + + idCol.vector[1] = 2L; + eventsCol.offsets[1] = 2; + eventsCol.lengths[1] = 1; + eventsChild.time[2] = ts3; + eventsChild.nanos[2] = 0; + }); + + StorageObject storageObject = createStorageObject(orcData); + OrcFormatReader reader = new OrcFormatReader(blockFactory); + + SourceMetadata metadata = reader.metadata(storageObject); + assertEquals(DataType.DATETIME, metadata.schema().get(1).dataType()); + + try (CloseableIterator iterator = reader.read(storageObject, null, 1024)) { + assertTrue(iterator.hasNext()); + Page page = iterator.next(); + + assertEquals(2, page.getPositionCount()); + + LongBlock eventsBlock = (LongBlock) page.getBlock(1); + assertEquals(2, eventsBlock.getValueCount(0)); + assertEquals(ts1, eventsBlock.getLong(0)); + assertEquals(ts2, eventsBlock.getLong(1)); + assertEquals(1, eventsBlock.getValueCount(1)); + assertTrue("pre-epoch list element should be negative", eventsBlock.getLong(2) < 0); + assertEquals(ts3, eventsBlock.getLong(2)); + } + } + public void testBinaryMapsToUnsupported() throws Exception { TypeDescription schema = TypeDescription.createStruct() .addField("id", TypeDescription.createLong())