diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 32c5e78d4f04..cc74650812e9 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -78,58 +78,112 @@ jobs: run: shell: bash steps: + - name: Monitor disk usage - Initial + run: | + echo "=== Initial Disk Usage ===" + df -h / + echo "" + + - name: Remove unnecessary preinstalled software + run: | + echo "=== Cleaning up host disk space ===" + echo "Disk space before cleanup:" + df -h / + + # Clean apt cache + apt-get clean || true + + # Remove GitHub Actions tool cache + rm -rf /__t/* || true + + # Remove large packages from host filesystem (mounted at /host/) + rm -rf /host/usr/share/dotnet || true + rm -rf /host/usr/local/lib/android || true + rm -rf /host/usr/local/.ghcup || true + rm -rf /host/opt/hostedtoolcache/CodeQL || true + + echo "" + echo "Disk space after cleanup:" + df -h / + echo "" + # This is necessary so that actions/checkout can find git - name: Export conda path run: echo "/opt/conda/envs/arrow/bin" >> $GITHUB_PATH # This is necessary so that Rust can find cargo - name: Export cargo path run: echo "/root/.cargo/bin" >> $GITHUB_PATH - - name: Check rustup - run: which rustup - - name: Check cmake - run: which cmake + + # Checkout repos (using shallow clones with fetch-depth: 1) - name: Checkout Arrow uses: actions/checkout@v6 with: repository: apache/arrow submodules: true - fetch-depth: 0 + fetch-depth: 1 - name: Checkout Arrow Rust uses: actions/checkout@v6 with: path: rust submodules: true - fetch-depth: 0 + fetch-depth: 1 - name: Checkout Arrow .NET uses: actions/checkout@v6 with: repository: apache/arrow-dotnet path: dotnet + fetch-depth: 1 - name: Checkout Arrow Go uses: actions/checkout@v6 with: repository: apache/arrow-go path: go + fetch-depth: 1 - name: Checkout Arrow Java uses: actions/checkout@v6 with: repository: apache/arrow-java path: java + fetch-depth: 1 - name: Checkout Arrow JavaScript uses: 
actions/checkout@v6 with: repository: apache/arrow-js path: js + fetch-depth: 1 - name: Checkout Arrow nanoarrow uses: actions/checkout@v6 with: repository: apache/arrow-nanoarrow path: nanoarrow + fetch-depth: 1 + + - name: Monitor disk usage - After checkouts + run: | + echo "=== After Checkouts ===" + df -h / + echo "" + - name: Build run: conda run --no-capture-output ci/scripts/integration_arrow_build.sh $PWD /build + + - name: Monitor disk usage - After build + if: always() + run: | + echo "=== After Build ===" + df -h / + echo "" + - name: Run run: conda run --no-capture-output ci/scripts/integration_arrow.sh $PWD /build + - name: Monitor disk usage - After tests + if: always() + run: | + echo "=== After Tests ===" + df -h / + echo "" + # test FFI against the C-Data interface exposed by pyarrow pyarrow-integration-test: name: Pyarrow C Data Interface diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 0aa159fa7993..a651a860f893 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -20,6 +20,173 @@ # Historical Changelog +## [57.1.0](https://github.com/apache/arrow-rs/tree/57.1.0) (2025-11-20) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/57.0.0...57.1.0) + +**Implemented enhancements:** + +- Eliminate bound checks in filter kernels [\#8865](https://github.com/apache/arrow-rs/issues/8865) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Respect page index policy option for ParquetObjectReader when it's not skip [\#8856](https://github.com/apache/arrow-rs/issues/8856) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Speed up collect\_bool and remove `unsafe` [\#8848](https://github.com/apache/arrow-rs/issues/8848) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Error reading parquet FileMetaData with empty lists encoded as element-type=0 [\#8826](https://github.com/apache/arrow-rs/issues/8826) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- ValueStatistics methods can't be used 
from generic context in external crate [\#8823](https://github.com/apache/arrow-rs/issues/8823) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Custom Pretty-Printing Implementation for Column when Formatting Record Batches [\#8821](https://github.com/apache/arrow-rs/issues/8821) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet-concat: supports bloom filter and page index [\#8804](https://github.com/apache/arrow-rs/issues/8804) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- \[Parquet\] virtual row number support [\#7299](https://github.com/apache/arrow-rs/issues/7299) +- \[Variant\] Enforce shredded-type validation in `shred_variant` [\#8795](https://github.com/apache/arrow-rs/issues/8795) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Simplify decision logic to call `FilterBuilder::optimize` or not [\#8781](https://github.com/apache/arrow-rs/issues/8781) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Variant\] Add variant to arrow for DataType::{Binary, LargeBinary, BinaryView} [\#8767](https://github.com/apache/arrow-rs/issues/8767) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Provide algorithm that allows zipping arrays whose values are not prealigned [\#8752](https://github.com/apache/arrow-rs/issues/8752) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Parquet\] ParquetMetadataReader decodes too much metadata under point-get scenerio [\#8751](https://github.com/apache/arrow-rs/issues/8751) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `arrow-json` supports encoding binary arrays, but not decoding [\#8736](https://github.com/apache/arrow-rs/issues/8736) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Allow `FilterPredicate` instances to be reused for RecordBatches [\#8692](https://github.com/apache/arrow-rs/issues/8692) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- 
ArrowJsonBatch::from\_batch is incomplete [\#8684](https://github.com/apache/arrow-rs/issues/8684) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet-layout: More info about layout including footer size, page index, bloom filter? [\#8682](https://github.com/apache/arrow-rs/issues/8682) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Rewrite `ParquetRecordBatchStream` \(async API\) in terms of the PushDecoder [\#8677](https://github.com/apache/arrow-rs/issues/8677) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- \[JSON\] Add encoding for binary view [\#8674](https://github.com/apache/arrow-rs/issues/8674) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Refactor arrow-cast decimal casting to unify the rescale logic used in Parquet variant casts [\#8670](https://github.com/apache/arrow-rs/issues/8670) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Variant\] Support Uuid/`FixedSizeBinary(16)` shredding [\#8665](https://github.com/apache/arrow-rs/issues/8665) +- \[Parquet\]There should be an encoding counter to know how many encodings the repo supports in total [\#8662](https://github.com/apache/arrow-rs/issues/8662) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Improve `parse_data_type` for `List`, `ListView`, `LargeList`, `LargeListView`, `FixedSizeList`, `Union`, `Map`, `RunEndCoded`. 
[\#8648](https://github.com/apache/arrow-rs/issues/8648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Variant\] Support variant to arrow primitive support null/time/decimal\_\* [\#8637](https://github.com/apache/arrow-rs/issues/8637) +- Return error from `RleDecoder::reset` rather than panic [\#8632](https://github.com/apache/arrow-rs/issues/8632) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add bitwise ops on `BooleanBufferBuilder` and `MutableBuffer` that mutate directly the buffer [\#8618](https://github.com/apache/arrow-rs/issues/8618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Variant\] Add variant\_to\_arrow Utf-8, LargeUtf8, Utf8View types support [\#8567](https://github.com/apache/arrow-rs/issues/8567) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- Regression: Parsing `List(Int64)` results in nullable list in 57.0.0 and a non-nullable list in 57.1.0 [\#8883](https://github.com/apache/arrow-rs/issues/8883) +- Regression: FixedSlizeList data type parsing fails on 57.1.0 [\#8880](https://github.com/apache/arrow-rs/issues/8880) +- \(dyn ArrayFormatterFactory + 'static\) can't be safely shared between threads [\#8875](https://github.com/apache/arrow-rs/issues/8875) +- RowNumber reader has wrong row group ordering [\#8864](https://github.com/apache/arrow-rs/issues/8864) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `ThriftMetadataWriter::write_column_indexes` cannot handle a `ColumnIndexMetaData::NONE` [\#8815](https://github.com/apache/arrow-rs/issues/8815) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- "Archery test With other arrows" Integration test failing on main: [\#8813](https://github.com/apache/arrow-rs/issues/8813) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Parquet\] Writing in 57.0.0 seems 10% slower than 56.0.0 [\#8783](https://github.com/apache/arrow-rs/issues/8783) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Parquet reader cannot handle files with unknown logical types [\#8776](https://github.com/apache/arrow-rs/issues/8776) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- zip now treats nulls as false in provided mask regardless of the underlying bit value [\#8721](https://github.com/apache/arrow-rs/issues/8721) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[avro\] Incorrect version in crate.io landing page [\#8691](https://github.com/apache/arrow-rs/issues/8691) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Array: ViewType gc\(\) has bug when array sum length exceed i32::MAX [\#8681](https://github.com/apache/arrow-rs/issues/8681) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet 56: encounter `error: item_reader def levels are None` when reading nested field with row filter [\#8657](https://github.com/apache/arrow-rs/issues/8657) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Degnerate and non-nullable `FixedSizeListArray`s are not handled [\#8623](https://github.com/apache/arrow-rs/issues/8623) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Parquet\]Performance Degradation with RowFilter on Unsorted Columns due to Fragmented ReadPlan [\#8565](https://github.com/apache/arrow-rs/issues/8565) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Documentation updates:** + +- docs: Add example for creating a `MutableBuffer` from `Buffer` [\#8853](https://github.com/apache/arrow-rs/pull/8853) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- docs: Add examples for creating MutableBuffer from Vec [\#8852](https://github.com/apache/arrow-rs/pull/8852) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Improve ParquetDecoder docs [\#8802](https://github.com/apache/arrow-rs/pull/8802) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Update docs for zero copy conversion of ScalarBuffer [\#8772](https://github.com/apache/arrow-rs/pull/8772) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add example to convert `PrimitiveArray` to a `Vec` [\#8771](https://github.com/apache/arrow-rs/pull/8771) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- docs: Add links for arrow-avro [\#8770](https://github.com/apache/arrow-rs/pull/8770) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- \[Parquet\] Minor: Update comments in page decompressor [\#8764](https://github.com/apache/arrow-rs/pull/8764) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Document limitations of the `arrow_integration_test` crate [\#8738](https://github.com/apache/arrow-rs/pull/8738) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([phil-opp](https://github.com/phil-opp)) +- docs: Add link to the Arrow implementation status page [\#8732](https://github.com/apache/arrow-rs/pull/8732) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- docs: Update Parquet readme implementation status [\#8731](https://github.com/apache/arrow-rs/pull/8731) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) + +**Performance improvements:** + +- `RowConverter::from_binary` should opportunistically take ownership of the buffer [\#8685](https://github.com/apache/arrow-rs/issues/8685) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Speed up filter some more \(up to 2x\) [\#8868](https://github.com/apache/arrow-rs/pull/8868) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Speed up 
`collect_bool` and remove `unsafe`, optimize `take_bits`, `take_native` for null values [\#8849](https://github.com/apache/arrow-rs/pull/8849) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Change `BooleanBuffer::append_packed_range` to use `apply_bitwise_binary_op` [\#8812](https://github.com/apache/arrow-rs/pull/8812) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- \[Parquet\] Avoid copying `LogicalType` in `ColumnOrder::get_sort_order`, deprecate `get_logical_type` [\#8789](https://github.com/apache/arrow-rs/pull/8789) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- perf: Speed up Parquet file writing \(10%, back to speed of 56\) [\#8786](https://github.com/apache/arrow-rs/pull/8786) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- perf: override `ArrayIter` default impl for `nth`, `nth_back`, `last` and `count` [\#8785](https://github.com/apache/arrow-rs/pull/8785) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- \[Parquet\] Reduce one copy in `SerializedPageReader` [\#8745](https://github.com/apache/arrow-rs/pull/8745) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([XiangpengHao](https://github.com/XiangpengHao)) +- Small optimization in Parquet varint decoder [\#8742](https://github.com/apache/arrow-rs/pull/8742) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- perf: override `count`, `nth`, `nth_back`, `last` and `max` for BitIterator [\#8696](https://github.com/apache/arrow-rs/pull/8696) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- Add `FilterPredicate::filter_record_batch` [\#8693](https://github.com/apache/arrow-rs/pull/8693) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pepijnve](https://github.com/pepijnve)) +- perf: zero-copy path in `RowConverter::from_binary` [\#8686](https://github.com/apache/arrow-rs/pull/8686) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mzabaluev](https://github.com/mzabaluev)) +- perf: add optimized zip implementation for scalars [\#8653](https://github.com/apache/arrow-rs/pull/8653) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- feat: add `apply_unary_op` and `apply_binary_op` bitwise operations [\#8619](https://github.com/apache/arrow-rs/pull/8619) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- \[Parquet\]Optimize the performance in record reader [\#8607](https://github.com/apache/arrow-rs/pull/8607) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([hhhizzz](https://github.com/hhhizzz)) + +**Closed issues:** + +- Variant to NullType conversion ignores strict casting [\#8810](https://github.com/apache/arrow-rs/issues/8810) +- Unify display representation for `Field` [\#8784](https://github.com/apache/arrow-rs/issues/8784) +- Misleading configuration name: skip\_arrow\_metadata [\#8780](https://github.com/apache/arrow-rs/issues/8780) +- Inconsistent display for types with Metadata [\#8761](https://github.com/apache/arrow-rs/issues/8761) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Internal `arrow-integration-test` crate is linked from `arrow` docs [\#8739](https://github.com/apache/arrow-rs/issues/8739) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add benchmark for RunEndEncoded casting [\#8709](https://github.com/apache/arrow-rs/issues/8709) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Varaint\] Support `VariantArray::value` to return a `Result` [\#8672](https://github.com/apache/arrow-rs/issues/8672) + +**Merged pull requests:** + +- Fix regression 
caused by changes in Display for DataType - display \(`List(non-null Int64)` instead of `List(nullable Int64)` [\#8890](https://github.com/apache/arrow-rs/pull/8890) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([etseidl](https://github.com/etseidl)) +- Support parsing for old style FixedSizeList [\#8882](https://github.com/apache/arrow-rs/pull/8882) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Make ArrayFormatterFactory Send + Sync and add a test [\#8878](https://github.com/apache/arrow-rs/pull/8878) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tobixdev](https://github.com/tobixdev)) +- Make `ArrowReaderOptions::with_virtual_columns` error rather than panic on invalid input [\#8867](https://github.com/apache/arrow-rs/pull/8867) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Fix errors when reading nested Lists with pushdown predicates. 
[\#8866](https://github.com/apache/arrow-rs/pull/8866) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Fix `RowNumberReader` when not all row groups are selected [\#8863](https://github.com/apache/arrow-rs/pull/8863) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([vustef](https://github.com/vustef)) +- Respect page index policy option for ParquetObjectReader when it's not skip [\#8857](https://github.com/apache/arrow-rs/pull/8857) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zhuqi-lucas](https://github.com/zhuqi-lucas)) +- build\(deps\): update apache-avro requirement from 0.20.0 to 0.21.0 [\#8832](https://github.com/apache/arrow-rs/pull/8832) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Allow Users to Provide Custom `ArrayFormatter`s when Pretty-Printing Record Batches [\#8829](https://github.com/apache/arrow-rs/pull/8829) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tobixdev](https://github.com/tobixdev)) +- Allow reading of improperly constructed empty lists in Parquet metadata [\#8827](https://github.com/apache/arrow-rs/pull/8827) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- \[Variant\] Fix cast logic for Variant to Arrow for DataType::Null [\#8825](https://github.com/apache/arrow-rs/pull/8825) ([klion26](https://github.com/klion26)) +- remove T: ParquetValueType bound on ValueStatistics [\#8824](https://github.com/apache/arrow-rs/pull/8824) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([pmarks](https://github.com/pmarks)) +- build\(deps\): update lz4\_flex requirement from 0.11 to 0.12 [\#8820](https://github.com/apache/arrow-rs/pull/8820) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([dependabot[bot]](https://github.com/apps/dependabot)) +- Fix bug in handling of empty Parquet page index structures [\#8817](https://github.com/apache/arrow-rs/pull/8817) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Parquet-concat: supports page index and bloom filter [\#8811](https://github.com/apache/arrow-rs/pull/8811) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([mapleFU](https://github.com/mapleFU)) +- \[Doc\] Correct `ListArray` documentation [\#8803](https://github.com/apache/arrow-rs/pull/8803) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liamzwbao](https://github.com/liamzwbao)) +- \[Parquet\] Add additional docs for `ArrowReaderOptions` and `ArrowReaderMetadata` [\#8798](https://github.com/apache/arrow-rs/pull/8798) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- \[Variant\] Enforce shredded-type validation in `shred_variant` [\#8796](https://github.com/apache/arrow-rs/pull/8796) ([liamzwbao](https://github.com/liamzwbao)) +- Add `VariantPath::is_empty` [\#8791](https://github.com/apache/arrow-rs/pull/8791) ([friendlymatthew](https://github.com/friendlymatthew)) +- Add FilterBuilder::is\_optimize\_beneficial [\#8782](https://github.com/apache/arrow-rs/pull/8782) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pepijnve](https://github.com/pepijnve)) +- \[Parquet\] Allow reading of files with unknown logical types [\#8777](https://github.com/apache/arrow-rs/pull/8777) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- bench: add `ArrayIter` benchmarks [\#8774](https://github.com/apache/arrow-rs/pull/8774) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- Update Rust toolchain to 1.91 [\#8769](https://github.com/apache/arrow-rs/pull/8769) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- \[Variant\] Add variant to arrow for `DataType::{Binary/LargeBinary/BinaryView}` [\#8768](https://github.com/apache/arrow-rs/pull/8768) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([klion26](https://github.com/klion26)) +- feat: parse `DataType::Union`, `DataType::Map`, `DataType::RunEndEncoded` [\#8765](https://github.com/apache/arrow-rs/pull/8765) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dqkqd](https://github.com/dqkqd)) +- Add options to control various aspects of Parquet metadata decoding [\#8763](https://github.com/apache/arrow-rs/pull/8763) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- feat: Ensure consistent metadata display for data types [\#8760](https://github.com/apache/arrow-rs/pull/8760) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mhilton](https://github.com/mhilton)) +- Clean up predicate\_cache tests [\#8755](https://github.com/apache/arrow-rs/pull/8755) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- refactor `test_cache_projection_excludes_nested_columns` to use high level APIs [\#8754](https://github.com/apache/arrow-rs/pull/8754) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Add `merge` and `merge_n` kernels [\#8753](https://github.com/apache/arrow-rs/pull/8753) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pepijnve](https://github.com/pepijnve)) +- Fix lint in arrow-flight by updating assert\_cmd after it upgraded [\#8741](https://github.com/apache/arrow-rs/pull/8741) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([vegarsti](https://github.com/vegarsti)) +- Remove 
link to internal `arrow-integration-test` crate from main `arrow` crate [\#8740](https://github.com/apache/arrow-rs/pull/8740) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([phil-opp](https://github.com/phil-opp)) +- Implement hex decoding of JSON strings to binary arrays [\#8737](https://github.com/apache/arrow-rs/pull/8737) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([phil-opp](https://github.com/phil-opp)) +- \[Parquet\] Adaptive Parquet Predicate Pushdown [\#8733](https://github.com/apache/arrow-rs/pull/8733) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([hhhizzz](https://github.com/hhhizzz)) +- \[Parquet\] Return error from `RleDecoder::reload` rather than panic [\#8729](https://github.com/apache/arrow-rs/pull/8729) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liamzwbao](https://github.com/liamzwbao)) +- fix: `ArrayIter` does not report size hint correctly after advancing from the iterator back [\#8728](https://github.com/apache/arrow-rs/pull/8728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- perf: Use Vec::with\_capacity in cast\_to\_run\_end\_encoded [\#8726](https://github.com/apache/arrow-rs/pull/8726) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([vegarsti](https://github.com/vegarsti)) +- \[Variant\] Fix the index of an item in VariantArray in a unit test [\#8725](https://github.com/apache/arrow-rs/pull/8725) ([martin-g](https://github.com/martin-g)) +- build\(deps\): bump actions/download-artifact from 5 to 6 [\#8720](https://github.com/apache/arrow-rs/pull/8720) ([dependabot[bot]](https://github.com/apps/dependabot)) +- \[Variant\] Add try\_value/value for VariantArray [\#8719](https://github.com/apache/arrow-rs/pull/8719) ([klion26](https://github.com/klion26)) +- General virtual columns support + row numbers as a first use-case [\#8715](https://github.com/apache/arrow-rs/pull/8715) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([vustef](https://github.com/vustef)) +- feat: Parquet-layout add Index and Footer info [\#8712](https://github.com/apache/arrow-rs/pull/8712) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([mapleFU](https://github.com/mapleFU)) +- fix: `zip` now treats nulls as false in provided mask regardless of the underlying bit value [\#8711](https://github.com/apache/arrow-rs/pull/8711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- Add benchmark for casting to RunEndEncoded \(REE\) [\#8710](https://github.com/apache/arrow-rs/pull/8710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([vegarsti](https://github.com/vegarsti)) +- \[Minor\]: Document visibility for enums produced by Thrift macros [\#8706](https://github.com/apache/arrow-rs/pull/8706) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Update `arrow-avro` `README.md` version to 57 [\#8695](https://github.com/apache/arrow-rs/pull/8695) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jecsand838](https://github.com/jecsand838)) +- Fix: ViewType gc on huge batch would produce bad output [\#8694](https://github.com/apache/arrow-rs/pull/8694) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mapleFU](https://github.com/mapleFU)) +- Refactor arrow-cast decimal casting to unify the rescale logic used in Parquet variant casts [\#8689](https://github.com/apache/arrow-rs/pull/8689) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liamzwbao](https://github.com/liamzwbao)) +- check bit width to avoid panic in DeltaBitPackDecoder [\#8688](https://github.com/apache/arrow-rs/pull/8688) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([rambleraptor](https://github.com/rambleraptor)) +- \[thrift-remodel\] Use `thrift_enum` macro for `ConvertedType` 
[\#8680](https://github.com/apache/arrow-rs/pull/8680) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- \[JSON\] Map key supports utf8 view [\#8679](https://github.com/apache/arrow-rs/pull/8679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mapleFU](https://github.com/mapleFU)) +- \[JSON\] Add encoding for binary view [\#8675](https://github.com/apache/arrow-rs/pull/8675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mapleFU](https://github.com/mapleFU)) +- \[Parquet\] Account for FileDecryptor in ParquetMetaData heap size calculation [\#8671](https://github.com/apache/arrow-rs/pull/8671) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([adamreeve](https://github.com/adamreeve)) +- chore: update `OffsetBuffer::from_lengths(std::iter::repeat_n(, ));` with `OffsetBuffer::from_repeated_length(, );` [\#8669](https://github.com/apache/arrow-rs/pull/8669) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- \[Variant\] Support `shred_variant` for Uuids [\#8666](https://github.com/apache/arrow-rs/pull/8666) ([friendlymatthew](https://github.com/friendlymatthew)) +- \[Variant\] Remove `create_test_variant_array` helper method [\#8664](https://github.com/apache/arrow-rs/pull/8664) ([friendlymatthew](https://github.com/friendlymatthew)) +- \[parquet\] Adding counting method in thrift\_enum macro to support ENCODING\_SLOTS [\#8663](https://github.com/apache/arrow-rs/pull/8663) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([hhhizzz](https://github.com/hhhizzz)) +- chore: add test case of RowSelection::trim [\#8660](https://github.com/apache/arrow-rs/pull/8660) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([lichuang](https://github.com/lichuang)) +- feat: add `new_repeated` to `ByteArray` [\#8659](https://github.com/apache/arrow-rs/pull/8659) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- perf: add `repeat_slice_n_times` to `MutableBuffer` [\#8658](https://github.com/apache/arrow-rs/pull/8658) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- perf: add optimized function to create offset with same length [\#8656](https://github.com/apache/arrow-rs/pull/8656) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- \[Variant\] `rescale_decimal` followup [\#8655](https://github.com/apache/arrow-rs/pull/8655) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liamzwbao](https://github.com/liamzwbao)) +- feat: parse DataType `List`, `ListView`, `LargeList`, `LargeListView`, `FixedSizeList` [\#8649](https://github.com/apache/arrow-rs/pull/8649) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dqkqd](https://github.com/dqkqd)) +- Support more operations on ListView [\#8645](https://github.com/apache/arrow-rs/pull/8645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([a10y](https://github.com/a10y)) +- \[Variant\] Implement primitive type access for null/time/decimal\* [\#8638](https://github.com/apache/arrow-rs/pull/8638) ([klion26](https://github.com/klion26)) +- \[Variant\] refactor: Split builder.rs into several smaller files [\#8635](https://github.com/apache/arrow-rs/pull/8635) ([Weijun-H](https://github.com/Weijun-H)) +- add `try_new_with_length` constructor to `FixedSizeList` [\#8624](https://github.com/apache/arrow-rs/pull/8624) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([connortsui20](https://github.com/connortsui20)) +- Change some panics to errors in parquet decoder [\#8602](https://github.com/apache/arrow-rs/pull/8602) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([rambleraptor](https://github.com/rambleraptor)) +- Support `variant_to_arrow` for utf8 
[\#8600](https://github.com/apache/arrow-rs/pull/8600) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([sdf-jkl](https://github.com/sdf-jkl)) +- Cast support for RunEndEncoded arrays [\#8589](https://github.com/apache/arrow-rs/pull/8589) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([vegarsti](https://github.com/vegarsti)) + + + ## [57.0.0](https://github.com/apache/arrow-rs/tree/57.0.0) (2025-10-19) [Full Changelog](https://github.com/apache/arrow-rs/compare/56.2.0...57.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 240e9681c2ef..fbbdba7d36ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,170 +19,172 @@ # Changelog -## [57.1.0](https://github.com/apache/arrow-rs/tree/57.1.0) (2025-11-20) +## [57.2.0](https://github.com/apache/arrow-rs/tree/57.2.0) (2026-01-07) -[Full Changelog](https://github.com/apache/arrow-rs/compare/57.0.0...57.1.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/57.1.0...57.2.0) + +**Breaking changes:** + +- Seal Array trait [\#9092](https://github.com/apache/arrow-rs/pull/9092) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- \[Variant\] Unify the CastOptions usage in parquet-variant-compute [\#8984](https://github.com/apache/arrow-rs/pull/8984) ([klion26](https://github.com/klion26)) **Implemented enhancements:** -- Eliminate bound checks in filter kernels [\#8865](https://github.com/apache/arrow-rs/issues/8865) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Respect page index policy option for ParquetObjectReader when it's not skip [\#8856](https://github.com/apache/arrow-rs/issues/8856) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Speed up collect\_bool and remove `unsafe` [\#8848](https://github.com/apache/arrow-rs/issues/8848) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Error reading parquet FileMetaData with empty lists encoded as element-type=0 
[\#8826](https://github.com/apache/arrow-rs/issues/8826) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- ValueStatistics methods can't be used from generic context in external crate [\#8823](https://github.com/apache/arrow-rs/issues/8823) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Custom Pretty-Printing Implementation for Column when Formatting Record Batches [\#8821](https://github.com/apache/arrow-rs/issues/8821) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Parquet-concat: supports bloom filter and page index [\#8804](https://github.com/apache/arrow-rs/issues/8804) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- \[Parquet\] virtual row number support [\#7299](https://github.com/apache/arrow-rs/issues/7299) -- \[Variant\] Enforce shredded-type validation in `shred_variant` [\#8795](https://github.com/apache/arrow-rs/issues/8795) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Simplify decision logic to call `FilterBuilder::optimize` or not [\#8781](https://github.com/apache/arrow-rs/issues/8781) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Variant\] Add variant to arrow for DataType::{Binary, LargeBinary, BinaryView} [\#8767](https://github.com/apache/arrow-rs/issues/8767) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Provide algorithm that allows zipping arrays whose values are not prealigned [\#8752](https://github.com/apache/arrow-rs/issues/8752) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Parquet\] ParquetMetadataReader decodes too much metadata under point-get scenerio [\#8751](https://github.com/apache/arrow-rs/issues/8751) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- `arrow-json` supports encoding binary arrays, but not decoding [\#8736](https://github.com/apache/arrow-rs/issues/8736) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Allow `FilterPredicate` instances to be reused 
for RecordBatches [\#8692](https://github.com/apache/arrow-rs/issues/8692) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- ArrowJsonBatch::from\_batch is incomplete [\#8684](https://github.com/apache/arrow-rs/issues/8684) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- parquet-layout: More info about layout including footer size, page index, bloom filter? [\#8682](https://github.com/apache/arrow-rs/issues/8682) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Rewrite `ParquetRecordBatchStream` \(async API\) in terms of the PushDecoder [\#8677](https://github.com/apache/arrow-rs/issues/8677) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- \[JSON\] Add encoding for binary view [\#8674](https://github.com/apache/arrow-rs/issues/8674) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Refactor arrow-cast decimal casting to unify the rescale logic used in Parquet variant casts [\#8670](https://github.com/apache/arrow-rs/issues/8670) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Variant\] Support Uuid/`FixedSizeBinary(16)` shredding [\#8665](https://github.com/apache/arrow-rs/issues/8665) -- \[Parquet\]There should be an encoding counter to know how many encodings the repo supports in total [\#8662](https://github.com/apache/arrow-rs/issues/8662) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Improve `parse_data_type` for `List`, `ListView`, `LargeList`, `LargeListView`, `FixedSizeList`, `Union`, `Map`, `RunEndCoded`. 
[\#8648](https://github.com/apache/arrow-rs/issues/8648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Variant\] Support variant to arrow primitive support null/time/decimal\_\* [\#8637](https://github.com/apache/arrow-rs/issues/8637) -- Return error from `RleDecoder::reset` rather than panic [\#8632](https://github.com/apache/arrow-rs/issues/8632) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Add bitwise ops on `BooleanBufferBuilder` and `MutableBuffer` that mutate directly the buffer [\#8618](https://github.com/apache/arrow-rs/issues/8618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Variant\] Add variant\_to\_arrow Utf-8, LargeUtf8, Utf8View types support [\#8567](https://github.com/apache/arrow-rs/issues/8567) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[parquet\] further relax `LevelInfoBuilder::types_compatible` for `ArrowWriter` [\#9098](https://github.com/apache/arrow-rs/issues/9098) +- Update arrow-row documentation with Union encoding [\#9084](https://github.com/apache/arrow-rs/issues/9084) +- Add code examples for min and max compute functions [\#9055](https://github.com/apache/arrow-rs/issues/9055) +- Add `append_n` to bytes view builder API [\#9034](https://github.com/apache/arrow-rs/issues/9034) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Move `RunArray::get_physical_indices` to `RunEndBuffer` [\#9025](https://github.com/apache/arrow-rs/issues/9025) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Allow quote style in csv writer [\#9003](https://github.com/apache/arrow-rs/issues/9003) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- IPC support for ListView [\#9002](https://github.com/apache/arrow-rs/issues/9002) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement `BinaryArrayType` for `&FixedSizeBinaryArray`s [\#8992](https://github.com/apache/arrow-rs/issues/8992) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- arrow-buffer: implement num-traits for i256 [\#8976](https://github.com/apache/arrow-rs/issues/8976) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support for `Arc` in `ParquetRecordWriter` derive macro [\#8972](https://github.com/apache/arrow-rs/issues/8972) +- \[arrow-avro\] suggest switching from xz to liblzma [\#8970](https://github.com/apache/arrow-rs/issues/8970) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- arrow-buffer: add i256::trailing\_zeros [\#8968](https://github.com/apache/arrow-rs/issues/8968) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- arrow-buffer: make i256::leading\_zeros public [\#8965](https://github.com/apache/arrow-rs/issues/8965) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add spark like `ignoreLeadingWhiteSpace` and `ignoreTrailingWhiteSpace` options to the csv writer [\#8961](https://github.com/apache/arrow-rs/issues/8961) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add round trip benchmark for Parquet writer/reader [\#8955](https://github.com/apache/arrow-rs/issues/8955) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support performant `interleave` for List/LargeList [\#8952](https://github.com/apache/arrow-rs/issues/8952) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Variant\] Support array access when parsing `VariantPath` [\#8946](https://github.com/apache/arrow-rs/issues/8946) +- Some panic!s could be represented as unimplemented!s [\#8932](https://github.com/apache/arrow-rs/issues/8932) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Variant\] easier way to construct a shredded schema [\#8922](https://github.com/apache/arrow-rs/issues/8922) +- Support `DataType::ListView` and `DataType::LargeListView` in `ArrayData::new_null` [\#8908](https://github.com/apache/arrow-rs/issues/8908) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `GenericListViewArray::from_iter_primitive` [\#8906](https://github.com/apache/arrow-rs/issues/8906) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Variant\] Unify the cast option usage in ParquetVariant [\#8873](https://github.com/apache/arrow-rs/issues/8873) +- Blog post about efficient filter representation in Parquet filter pushdown [\#8843](https://github.com/apache/arrow-rs/issues/8843) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add comparison support for Union arrays in the `cmp` kernel [\#8837](https://github.com/apache/arrow-rs/issues/8837) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Variant\] Support array shredding into `List/LargeList/ListView/LargeListView` [\#8830](https://github.com/apache/arrow-rs/issues/8830) +- Support `Union` data types for row format [\#8828](https://github.com/apache/arrow-rs/issues/8828) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- FFI support for ListView [\#8819](https://github.com/apache/arrow-rs/issues/8819) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Variant\] Support more Arrow Datatypes from Variant primitive types [\#8805](https://github.com/apache/arrow-rs/issues/8805) +- `FixedSizeBinaryBuilder` supports `append_array` [\#8750](https://github.com/apache/arrow-rs/issues/8750) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement special case `zip` with scalar for Utf8View [\#8724](https://github.com/apache/arrow-rs/issues/8724) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[geometry\] Wire up arrow reader/writer for `GEOMETRY` and `GEOGRAPHY` [\#8717](https://github.com/apache/arrow-rs/issues/8717) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] **Fixed bugs:** -- Regression: Parsing `List(Int64)` results in nullable list in 57.0.0 and a non-nullable list in 57.1.0
[\#8883](https://github.com/apache/arrow-rs/issues/8883) -- Regression: FixedSlizeList data type parsing fails on 57.1.0 [\#8880](https://github.com/apache/arrow-rs/issues/8880) -- \(dyn ArrayFormatterFactory + 'static\) can't be safely shared between threads [\#8875](https://github.com/apache/arrow-rs/issues/8875) -- RowNumber reader has wrong row group ordering [\#8864](https://github.com/apache/arrow-rs/issues/8864) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- `ThriftMetadataWriter::write_column_indexes` cannot handle a `ColumnIndexMetaData::NONE` [\#8815](https://github.com/apache/arrow-rs/issues/8815) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- "Archery test With other arrows" Integration test failing on main: [\#8813](https://github.com/apache/arrow-rs/issues/8813) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Parquet\] Writing in 57.0.0 seems 10% slower than 56.0.0 [\#8783](https://github.com/apache/arrow-rs/issues/8783) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Parquet reader cannot handle files with unknown logical types [\#8776](https://github.com/apache/arrow-rs/issues/8776) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- zip now treats nulls as false in provided mask regardless of the underlying bit value [\#8721](https://github.com/apache/arrow-rs/issues/8721) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[avro\] Incorrect version in crate.io landing page [\#8691](https://github.com/apache/arrow-rs/issues/8691) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Array: ViewType gc\(\) has bug when array sum length exceed i32::MAX [\#8681](https://github.com/apache/arrow-rs/issues/8681) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Parquet 56: encounter `error: item_reader def levels are None` when reading nested field with row filter [\#8657](https://github.com/apache/arrow-rs/issues/8657) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Degnerate and non-nullable `FixedSizeListArray`s are not handled [\#8623](https://github.com/apache/arrow-rs/issues/8623) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Parquet\]Performance Degradation with RowFilter on Unsorted Columns due to Fragmented ReadPlan [\#8565](https://github.com/apache/arrow-rs/issues/8565) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Soundness Bug in `try_binary` when `Array` is implemented incorrectly in external crate [\#9106](https://github.com/apache/arrow-rs/issues/9106) +- casting `Dict(_, LargeUtf8)` to `Utf8View` \(`StringViewArray`\) panics [\#9101](https://github.com/apache/arrow-rs/issues/9101) +- wrong results for null count of `nullif` kernel [\#9085](https://github.com/apache/arrow-rs/issues/9085) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Empty first line in some code examples [\#9063](https://github.com/apache/arrow-rs/issues/9063) +- GenericByteViewArray::slice is not zero-copy but ought to be [\#9014](https://github.com/apache/arrow-rs/issues/9014) +- Regression in struct casting in 57.2.0 \(not yet released\) [\#9005](https://github.com/apache/arrow-rs/issues/9005) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Fix panic when decoding multiple Union columns in RowConverter [\#8999](https://github.com/apache/arrow-rs/issues/8999) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `take_fixed_size_binary` Does Not Consider NULL Indices [\#8947](https://github.com/apache/arrow-rs/issues/8947) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[arrow-avro\] RecordEncoder Bugs [\#8934](https://github.com/apache/arrow-rs/issues/8934) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `FixedSizeBinaryArray::try_new(...)` Panics with Item Length of Zero 
[\#8926](https://github.com/apache/arrow-rs/issues/8926) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `cargo test -p arrow-cast` fails on main [\#8910](https://github.com/apache/arrow-rs/issues/8910) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `GenericListViewArray::new_null` ignores `len` and returns an empty array [\#8904](https://github.com/apache/arrow-rs/issues/8904) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `FixedSizeBinaryArray::new_null` Does Not Properly Set the Length of the Values Buffer [\#8900](https://github.com/apache/arrow-rs/issues/8900) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Struct casting requires same order of fields [\#8870](https://github.com/apache/arrow-rs/issues/8870) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Cannot cast string dictionary to binary view [\#8841](https://github.com/apache/arrow-rs/issues/8841) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Documentation updates:** -- docs: Add example for creating a `MutableBuffer` from `Buffer` [\#8853](https://github.com/apache/arrow-rs/pull/8853) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- docs: Add examples for creating MutableBuffer from Vec [\#8852](https://github.com/apache/arrow-rs/pull/8852) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Improve ParquetDecoder docs [\#8802](https://github.com/apache/arrow-rs/pull/8802) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Update docs for zero copy conversion of ScalarBuffer [\#8772](https://github.com/apache/arrow-rs/pull/8772) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Add example to convert `PrimitiveArray` to a `Vec` [\#8771](https://github.com/apache/arrow-rs/pull/8771) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- docs: Add links for arrow-avro [\#8770](https://github.com/apache/arrow-rs/pull/8770) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- \[Parquet\] Minor: Update comments in page decompressor [\#8764](https://github.com/apache/arrow-rs/pull/8764) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Document limitations of the `arrow_integration_test` crate [\#8738](https://github.com/apache/arrow-rs/pull/8738) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([phil-opp](https://github.com/phil-opp)) -- docs: Add link to the Arrow implementation status page [\#8732](https://github.com/apache/arrow-rs/pull/8732) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- docs: Update Parquet readme implementation status [\#8731](https://github.com/apache/arrow-rs/pull/8731) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Add Union encoding documentation [\#9102](https://github.com/apache/arrow-rs/pull/9102) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([EduardAkhmetshin](https://github.com/EduardAkhmetshin)) +- docs: fix misleading reserve documentation [\#9076](https://github.com/apache/arrow-rs/pull/9076) ([WaterWhisperer](https://github.com/WaterWhisperer)) +- Fix headers and empty lines in code examples [\#9064](https://github.com/apache/arrow-rs/pull/9064) ([EduardAkhmetshin](https://github.com/EduardAkhmetshin)) +- Add examples for min and max functions [\#9062](https://github.com/apache/arrow-rs/pull/9062) ([EduardAkhmetshin](https://github.com/EduardAkhmetshin)) +- Improve arrow-buffer documentation [\#9020](https://github.com/apache/arrow-rs/pull/9020) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Move 
examples in arrow-csv to docstrings, polish up docs [\#9001](https://github.com/apache/arrow-rs/pull/9001) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add example of parsing field names as VariantPath [\#8945](https://github.com/apache/arrow-rs/pull/8945) ([alamb](https://github.com/alamb)) +- Improve documentation for `prep\_null\_mask\_filter` [\#8722](https://github.com/apache/arrow-rs/pull/8722) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) **Performance improvements:** -- `RowConverter::from_binary` should opportunistically take ownership of the buffer [\#8685](https://github.com/apache/arrow-rs/issues/8685) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Speed up filter some more \(up to 2x\) [\#8868](https://github.com/apache/arrow-rs/pull/8868) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- Speed up `collect_bool` and remove `unsafe`, optimize `take_bits`, `take_native` for null values [\#8849](https://github.com/apache/arrow-rs/pull/8849) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- Change `BooleanBuffer::append_packed_range` to use `apply_bitwise_binary_op` [\#8812](https://github.com/apache/arrow-rs/pull/8812) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- \[Parquet\] Avoid copying `LogicalType` in `ColumnOrder::get_sort_order`, deprecate `get_logical_type` [\#8789](https://github.com/apache/arrow-rs/pull/8789) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- perf: Speed up Parquet file writing \(10%, back to speed of 56\) [\#8786](https://github.com/apache/arrow-rs/pull/8786) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- perf: override `ArrayIter` default impl
for `nth`, `nth_back`, `last` and `count` [\#8785](https://github.com/apache/arrow-rs/pull/8785) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- \[Parquet\] Reduce one copy in `SerializedPageReader` [\#8745](https://github.com/apache/arrow-rs/pull/8745) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([XiangpengHao](https://github.com/XiangpengHao)) -- Small optimization in Parquet varint decoder [\#8742](https://github.com/apache/arrow-rs/pull/8742) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- perf: override `count`, `nth`, `nth_back`, `last` and `max` for BitIterator [\#8696](https://github.com/apache/arrow-rs/pull/8696) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- Add `FilterPredicate::filter_record_batch` [\#8693](https://github.com/apache/arrow-rs/pull/8693) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pepijnve](https://github.com/pepijnve)) -- perf: zero-copy path in `RowConverter::from_binary` [\#8686](https://github.com/apache/arrow-rs/pull/8686) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mzabaluev](https://github.com/mzabaluev)) -- perf: add optimized zip implementation for scalars [\#8653](https://github.com/apache/arrow-rs/pull/8653) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- feat: add `apply_unary_op` and `apply_binary_op` bitwise operations [\#8619](https://github.com/apache/arrow-rs/pull/8619) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- \[Parquet\]Optimize the performance in record reader [\#8607](https://github.com/apache/arrow-rs/pull/8607) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([hhhizzz](https://github.com/hhhizzz)) +- \[parquet\] Avoid a clone while resolving the read strategy 
[\#9056](https://github.com/apache/arrow-rs/pull/9056) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- perf: improve performance of encoding `GenericByteArray` by 8% [\#9054](https://github.com/apache/arrow-rs/pull/9054) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- Speed up unary `not` kernel by 50%, add `BooleanBuffer::from_bitwise_unary` [\#8996](https://github.com/apache/arrow-rs/pull/8996) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- arrow-select: improve dictionary interleave fallback performance [\#8978](https://github.com/apache/arrow-rs/pull/8978) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([asubiotto](https://github.com/asubiotto)) +- Add special implementation for zip for Utf8View/BinaryView scalars [\#8963](https://github.com/apache/arrow-rs/pull/8963) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mkleen](https://github.com/mkleen)) +- arrow-select: implement specialized interleave\_list [\#8953](https://github.com/apache/arrow-rs/pull/8953) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([asubiotto](https://github.com/asubiotto)) **Closed issues:** -- Variant to NullType conversion ignores strict casting [\#8810](https://github.com/apache/arrow-rs/issues/8810) -- Unify display representation for `Field` [\#8784](https://github.com/apache/arrow-rs/issues/8784) -- Misleading configuration name: skip\_arrow\_metadata [\#8780](https://github.com/apache/arrow-rs/issues/8780) -- Inconsistent display for types with Metadata [\#8761](https://github.com/apache/arrow-rs/issues/8761) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Internal `arrow-integration-test` crate is linked from `arrow` docs [\#8739](https://github.com/apache/arrow-rs/issues/8739) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add benchmark for RunEndEncoded 
casting [\#8709](https://github.com/apache/arrow-rs/issues/8709) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Varaint\] Support `VariantArray::value` to return a `Result` [\#8672](https://github.com/apache/arrow-rs/issues/8672) +- impl `Index` for `UnionFields` [\#8958](https://github.com/apache/arrow-rs/issues/8958) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** -- Fix regression caused by changes in Display for DataType - display \(`List(non-null Int64)` instead of `List(nullable Int64)` [\#8890](https://github.com/apache/arrow-rs/pull/8890) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([etseidl](https://github.com/etseidl)) -- Support parsing for old style FixedSizeList [\#8882](https://github.com/apache/arrow-rs/pull/8882) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Make ArrayFormatterFactory Send + Sync and add a test [\#8878](https://github.com/apache/arrow-rs/pull/8878) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tobixdev](https://github.com/tobixdev)) -- Make `ArrowReaderOptions::with_virtual_columns` error rather than panic on invalid input [\#8867](https://github.com/apache/arrow-rs/pull/8867) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Fix errors when reading nested Lists with pushdown predicates. 
[\#8866](https://github.com/apache/arrow-rs/pull/8866) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Fix `RowNumberReader` when not all row groups are selected [\#8863](https://github.com/apache/arrow-rs/pull/8863) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([vustef](https://github.com/vustef)) -- Respect page index policy option for ParquetObjectReader when it's not skip [\#8857](https://github.com/apache/arrow-rs/pull/8857) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zhuqi-lucas](https://github.com/zhuqi-lucas)) -- build\(deps\): update apache-avro requirement from 0.20.0 to 0.21.0 [\#8832](https://github.com/apache/arrow-rs/pull/8832) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Allow Users to Provide Custom `ArrayFormatter`s when Pretty-Printing Record Batches [\#8829](https://github.com/apache/arrow-rs/pull/8829) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tobixdev](https://github.com/tobixdev)) -- Allow reading of improperly constructed empty lists in Parquet metadata [\#8827](https://github.com/apache/arrow-rs/pull/8827) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- \[Variant\] Fix cast logic for Variant to Arrow for DataType::Null [\#8825](https://github.com/apache/arrow-rs/pull/8825) ([klion26](https://github.com/klion26)) -- remove T: ParquetValueType bound on ValueStatistics [\#8824](https://github.com/apache/arrow-rs/pull/8824) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([pmarks](https://github.com/pmarks)) -- build\(deps\): update lz4\_flex requirement from 0.11 to 0.12 [\#8820](https://github.com/apache/arrow-rs/pull/8820) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([dependabot[bot]](https://github.com/apps/dependabot)) -- Fix bug in handling of empty Parquet page index structures [\#8817](https://github.com/apache/arrow-rs/pull/8817) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- Parquet-concat: supports page index and bloom filter [\#8811](https://github.com/apache/arrow-rs/pull/8811) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([mapleFU](https://github.com/mapleFU)) -- \[Doc\] Correct `ListArray` documentation [\#8803](https://github.com/apache/arrow-rs/pull/8803) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liamzwbao](https://github.com/liamzwbao)) -- \[Parquet\] Add additional docs for `ArrowReaderOptions` and `ArrowReaderMetadata` [\#8798](https://github.com/apache/arrow-rs/pull/8798) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- \[Variant\] Enforce shredded-type validation in `shred_variant` [\#8796](https://github.com/apache/arrow-rs/pull/8796) ([liamzwbao](https://github.com/liamzwbao)) -- Add `VariantPath::is_empty` [\#8791](https://github.com/apache/arrow-rs/pull/8791) ([friendlymatthew](https://github.com/friendlymatthew)) -- Add FilterBuilder::is\_optimize\_beneficial [\#8782](https://github.com/apache/arrow-rs/pull/8782) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pepijnve](https://github.com/pepijnve)) -- \[Parquet\] Allow reading of files with unknown logical types [\#8777](https://github.com/apache/arrow-rs/pull/8777) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- bench: add `ArrayIter` benchmarks [\#8774](https://github.com/apache/arrow-rs/pull/8774) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- Update Rust toolchain to 1.91 [\#8769](https://github.com/apache/arrow-rs/pull/8769) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- \[Variant\] Add variant to arrow for `DataType::{Binary/LargeBinary/BinaryView}` [\#8768](https://github.com/apache/arrow-rs/pull/8768) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([klion26](https://github.com/klion26)) -- feat: parse `DataType::Union`, `DataType::Map`, `DataType::RunEndEncoded` [\#8765](https://github.com/apache/arrow-rs/pull/8765) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dqkqd](https://github.com/dqkqd)) -- Add options to control various aspects of Parquet metadata decoding [\#8763](https://github.com/apache/arrow-rs/pull/8763) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- feat: Ensure consistent metadata display for data types [\#8760](https://github.com/apache/arrow-rs/pull/8760) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mhilton](https://github.com/mhilton)) -- Clean up predicate\_cache tests [\#8755](https://github.com/apache/arrow-rs/pull/8755) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- refactor `test_cache_projection_excludes_nested_columns` to use high level APIs [\#8754](https://github.com/apache/arrow-rs/pull/8754) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Add `merge` and `merge_n` kernels [\#8753](https://github.com/apache/arrow-rs/pull/8753) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pepijnve](https://github.com/pepijnve)) -- Fix lint in arrow-flight by updating assert\_cmd after it upgraded [\#8741](https://github.com/apache/arrow-rs/pull/8741) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([vegarsti](https://github.com/vegarsti)) -- Remove 
link to internal `arrow-integration-test` crate from main `arrow` crate [\#8740](https://github.com/apache/arrow-rs/pull/8740) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([phil-opp](https://github.com/phil-opp)) -- Implement hex decoding of JSON strings to binary arrays [\#8737](https://github.com/apache/arrow-rs/pull/8737) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([phil-opp](https://github.com/phil-opp)) -- \[Parquet\] Adaptive Parquet Predicate Pushdown [\#8733](https://github.com/apache/arrow-rs/pull/8733) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([hhhizzz](https://github.com/hhhizzz)) -- \[Parquet\] Return error from `RleDecoder::reload` rather than panic [\#8729](https://github.com/apache/arrow-rs/pull/8729) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liamzwbao](https://github.com/liamzwbao)) -- fix: `ArrayIter` does not report size hint correctly after advancing from the iterator back [\#8728](https://github.com/apache/arrow-rs/pull/8728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- perf: Use Vec::with\_capacity in cast\_to\_run\_end\_encoded [\#8726](https://github.com/apache/arrow-rs/pull/8726) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([vegarsti](https://github.com/vegarsti)) -- \[Variant\] Fix the index of an item in VariantArray in a unit test [\#8725](https://github.com/apache/arrow-rs/pull/8725) ([martin-g](https://github.com/martin-g)) -- build\(deps\): bump actions/download-artifact from 5 to 6 [\#8720](https://github.com/apache/arrow-rs/pull/8720) ([dependabot[bot]](https://github.com/apps/dependabot)) -- \[Variant\] Add try\_value/value for VariantArray [\#8719](https://github.com/apache/arrow-rs/pull/8719) ([klion26](https://github.com/klion26)) -- General virtual columns support + row numbers as a first use-case [\#8715](https://github.com/apache/arrow-rs/pull/8715) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([vustef](https://github.com/vustef)) -- feat: Parquet-layout add Index and Footer info [\#8712](https://github.com/apache/arrow-rs/pull/8712) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([mapleFU](https://github.com/mapleFU)) -- fix: `zip` now treats nulls as false in provided mask regardless of the underlying bit value [\#8711](https://github.com/apache/arrow-rs/pull/8711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- Add benchmark for casting to RunEndEncoded \(REE\) [\#8710](https://github.com/apache/arrow-rs/pull/8710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([vegarsti](https://github.com/vegarsti)) -- \[Minor\]: Document visibility for enums produced by Thrift macros [\#8706](https://github.com/apache/arrow-rs/pull/8706) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- Update `arrow-avro` `README.md` version to 57 [\#8695](https://github.com/apache/arrow-rs/pull/8695) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jecsand838](https://github.com/jecsand838)) -- Fix: ViewType gc on huge batch would produce bad output [\#8694](https://github.com/apache/arrow-rs/pull/8694) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mapleFU](https://github.com/mapleFU)) -- Refactor arrow-cast decimal casting to unify the rescale logic used in Parquet variant casts [\#8689](https://github.com/apache/arrow-rs/pull/8689) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liamzwbao](https://github.com/liamzwbao)) -- check bit width to avoid panic in DeltaBitPackDecoder [\#8688](https://github.com/apache/arrow-rs/pull/8688) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([rambleraptor](https://github.com/rambleraptor)) -- \[thrift-remodel\] Use `thrift_enum` macro for `ConvertedType` 
[\#8680](https://github.com/apache/arrow-rs/pull/8680) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- \[JSON\] Map key supports utf8 view [\#8679](https://github.com/apache/arrow-rs/pull/8679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mapleFU](https://github.com/mapleFU)) -- \[JSON\] Add encoding for binary view [\#8675](https://github.com/apache/arrow-rs/pull/8675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mapleFU](https://github.com/mapleFU)) -- \[Parquet\] Account for FileDecryptor in ParquetMetaData heap size calculation [\#8671](https://github.com/apache/arrow-rs/pull/8671) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([adamreeve](https://github.com/adamreeve)) -- chore: update `OffsetBuffer::from_lengths(std::iter::repeat_n(, ));` with `OffsetBuffer::from_repeated_length(, );` [\#8669](https://github.com/apache/arrow-rs/pull/8669) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- \[Variant\] Support `shred_variant` for Uuids [\#8666](https://github.com/apache/arrow-rs/pull/8666) ([friendlymatthew](https://github.com/friendlymatthew)) -- \[Variant\] Remove `create_test_variant_array` helper method [\#8664](https://github.com/apache/arrow-rs/pull/8664) ([friendlymatthew](https://github.com/friendlymatthew)) -- \[parquet\] Adding counting method in thrift\_enum macro to support ENCODING\_SLOTS [\#8663](https://github.com/apache/arrow-rs/pull/8663) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([hhhizzz](https://github.com/hhhizzz)) -- chore: add test case of RowSelection::trim [\#8660](https://github.com/apache/arrow-rs/pull/8660) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([lichuang](https://github.com/lichuang)) -- feat: add `new_repeated` to `ByteArray` [\#8659](https://github.com/apache/arrow-rs/pull/8659) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- perf: add `repeat_slice_n_times` to `MutableBuffer` [\#8658](https://github.com/apache/arrow-rs/pull/8658) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- perf: add optimized function to create offset with same length [\#8656](https://github.com/apache/arrow-rs/pull/8656) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- \[Variant\] `rescale_decimal` followup [\#8655](https://github.com/apache/arrow-rs/pull/8655) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liamzwbao](https://github.com/liamzwbao)) -- feat: parse DataType `List`, `ListView`, `LargeList`, `LargeListView`, `FixedSizeList` [\#8649](https://github.com/apache/arrow-rs/pull/8649) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dqkqd](https://github.com/dqkqd)) -- Support more operations on ListView [\#8645](https://github.com/apache/arrow-rs/pull/8645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([a10y](https://github.com/a10y)) -- \[Variant\] Implement primitive type access for null/time/decimal\* [\#8638](https://github.com/apache/arrow-rs/pull/8638) ([klion26](https://github.com/klion26)) -- \[Variant\] refactor: Split builder.rs into several smaller files [\#8635](https://github.com/apache/arrow-rs/pull/8635) ([Weijun-H](https://github.com/Weijun-H)) -- add `try_new_with_length` constructor to `FixedSizeList` [\#8624](https://github.com/apache/arrow-rs/pull/8624) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([connortsui20](https://github.com/connortsui20)) -- Change some panics to errors in parquet decoder [\#8602](https://github.com/apache/arrow-rs/pull/8602) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([rambleraptor](https://github.com/rambleraptor)) -- Support `variant_to_arrow` for utf8 
[\#8600](https://github.com/apache/arrow-rs/pull/8600) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([sdf-jkl](https://github.com/sdf-jkl)) -- Cast support for RunEndEncoded arrays [\#8589](https://github.com/apache/arrow-rs/pull/8589) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([vegarsti](https://github.com/vegarsti)) +- Add `DataType::is_decimal` [\#9100](https://github.com/apache/arrow-rs/pull/9100) ([AdamGS](https://github.com/AdamGS)) +- feat\(parquet\): relax type compatibility check in parquet ArrowWriter [\#9099](https://github.com/apache/arrow-rs/pull/9099) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([gruuya](https://github.com/gruuya)) +- \[Variant\] Move `ArrayVariantToArrowRowBuilder` to `variant_to_arrow` [\#9094](https://github.com/apache/arrow-rs/pull/9094) ([liamzwbao](https://github.com/liamzwbao)) +- chore: increase row count and batch size for more deterministic tests [\#9088](https://github.com/apache/arrow-rs/pull/9088) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Fix `nullif` kernel [\#9087](https://github.com/apache/arrow-rs/pull/9087) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add `FlightInfo::with_endpoints` method [\#9075](https://github.com/apache/arrow-rs/pull/9075) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([lewiszlw](https://github.com/lewiszlw)) +- chore: run validation when debug assertion enabled and not only for test [\#9073](https://github.com/apache/arrow-rs/pull/9073) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- Minor: make it clear cache array reader is not cloning arrays [\#9057](https://github.com/apache/arrow-rs/pull/9057) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) 
+- Minor: avoid clone in RunArray row decoding via buffer stealing [\#9052](https://github.com/apache/arrow-rs/pull/9052) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([lyang24](https://github.com/lyang24)) +- Minor: avoid some clones when reading parquet [\#9048](https://github.com/apache/arrow-rs/pull/9048) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- fix: don't generate nulls for `Decimal128` and `Decimal256` when field is non-nullable and have non-zero `null_density` [\#9046](https://github.com/apache/arrow-rs/pull/9046) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- fix: `Rows` `size` should use `capacity` and not `len` [\#9044](https://github.com/apache/arrow-rs/pull/9044) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- fix: integration / Archery test With other arrows container ran out of space [\#9043](https://github.com/apache/arrow-rs/pull/9043) ([lyang24](https://github.com/lyang24)) +- feat: add new `try_append_value_n()` function to `GenericByteViewBuilder` [\#9040](https://github.com/apache/arrow-rs/pull/9040) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([lyang24](https://github.com/lyang24)) +- Rename fields in BooleanBuffer for clarity [\#9039](https://github.com/apache/arrow-rs/pull/9039) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Allocate buffers before work in `boolean_kernels` benchmark [\#9035](https://github.com/apache/arrow-rs/pull/9035) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Move RunArray::get\_physical\_indices to RunEndBuffer [\#9027](https://github.com/apache/arrow-rs/pull/9027) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([lyang24](https://github.com/lyang24)) +- Improve `RunArray` documentation 
[\#9019](https://github.com/apache/arrow-rs/pull/9019) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Add BooleanArray tests for null and slice behavior [\#9013](https://github.com/apache/arrow-rs/pull/9013) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([UtkarshSahay123](https://github.com/UtkarshSahay123)) +- feat: support array indices in VariantPath dot notation [\#9012](https://github.com/apache/arrow-rs/pull/9012) ([foskey51](https://github.com/foskey51)) +- arrow-cast: Bring back in-order field casting for `StructArray` [\#9007](https://github.com/apache/arrow-rs/pull/9007) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([brancz](https://github.com/brancz)) +- arrow-ipc: Add ListView support [\#9006](https://github.com/apache/arrow-rs/pull/9006) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([brancz](https://github.com/brancz)) +- Add quote style to csv writer [\#9004](https://github.com/apache/arrow-rs/pull/9004) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([xanderbailey](https://github.com/xanderbailey)) +- Fix row slice bug in Union column decoding with many columns [\#9000](https://github.com/apache/arrow-rs/pull/9000) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([friendlymatthew](https://github.com/friendlymatthew)) +- build\(deps\): bump actions/download-artifact from 6 to 7 [\#8995](https://github.com/apache/arrow-rs/pull/8995) ([dependabot[bot]](https://github.com/apps/dependabot)) +- minor: Add comment blocks to PR template [\#8994](https://github.com/apache/arrow-rs/pull/8994) ([Jefffrey](https://github.com/Jefffrey)) +- Implement `BinaryArrayType` for `&FixedSizeBinaryArray`s [\#8993](https://github.com/apache/arrow-rs/pull/8993) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- feat: impl BatchCoalescer::push\_batch\_with\_indices 
[\#8991](https://github.com/apache/arrow-rs/pull/8991) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ClSlaid](https://github.com/ClSlaid)) +- \[Arrow\]Configure max deduplication length for `StringView` [\#8990](https://github.com/apache/arrow-rs/pull/8990) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([lichuang](https://github.com/lichuang)) +- feat: implement append\_array for FixedSizeBinaryBuilder [\#8989](https://github.com/apache/arrow-rs/pull/8989) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ClSlaid](https://github.com/ClSlaid)) +- Add benchmarks for Utf8View scalars for zip [\#8988](https://github.com/apache/arrow-rs/pull/8988) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mkleen](https://github.com/mkleen)) +- build\(deps\): bump actions/cache from 4 to 5 [\#8986](https://github.com/apache/arrow-rs/pull/8986) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Take fsb null indices [\#8981](https://github.com/apache/arrow-rs/pull/8981) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add List to `interleave_kernels` benchmark [\#8980](https://github.com/apache/arrow-rs/pull/8980) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Fix ipc errors for `LargeList` containing sliced `StringViews` [\#8979](https://github.com/apache/arrow-rs/pull/8979) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fabianmurariu](https://github.com/fabianmurariu)) +- arrow-buffer: implement num-traits numeric operations [\#8977](https://github.com/apache/arrow-rs/pull/8977) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([theirix](https://github.com/theirix)) +- Update `xz` crate dependency to use `liblzma` in arrow-avro [\#8975](https://github.com/apache/arrow-rs/pull/8975) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jecsand838](https://github.com/jecsand838)) +- arrow-data: 
avoid allocating in get\_last\_run\_end [\#8974](https://github.com/apache/arrow-rs/pull/8974) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([asubiotto](https://github.com/asubiotto)) +- Support for `Arc` in `ParquetRecordWriter` derive macro [\#8973](https://github.com/apache/arrow-rs/pull/8973) ([heilhead](https://github.com/heilhead)) +- feat: support casting `Time32` to `Int64` [\#8971](https://github.com/apache/arrow-rs/pull/8971) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tshauck](https://github.com/tshauck)) +- arrow-buffer: add i256::trailing\_zeros [\#8969](https://github.com/apache/arrow-rs/pull/8969) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([theirix](https://github.com/theirix)) +- Perf: Vectorize check\_bounds\(2x speedup\) [\#8966](https://github.com/apache/arrow-rs/pull/8966) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gstvg](https://github.com/gstvg)) +- arrow-buffer: make i256::leading\_zeros public and tested [\#8964](https://github.com/apache/arrow-rs/pull/8964) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([theirix](https://github.com/theirix)) +- Add ignore leading and trailing white space to csv parser [\#8960](https://github.com/apache/arrow-rs/pull/8960) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([xanderbailey](https://github.com/xanderbailey)) +- Access `UnionFields` elements by index [\#8959](https://github.com/apache/arrow-rs/pull/8959) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([friendlymatthew](https://github.com/friendlymatthew)) +- Add Parquet roundtrip benchmarks [\#8956](https://github.com/apache/arrow-rs/pull/8956) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- \[Variant\] Add variant to arrow for Date64/Timestamp\(Second/Millisecond\)/Time32/Time64 [\#8950](https://github.com/apache/arrow-rs/pull/8950) ([klion26](https://github.com/klion26)) +- Let 
`ArrowArrayStreamReader` handle schema with attached metadata + do schema checking [\#8944](https://github.com/apache/arrow-rs/pull/8944) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jonded94](https://github.com/jonded94)) +- Adds ExtensionType for Parquet geospatial WKB arrays [\#8943](https://github.com/apache/arrow-rs/pull/8943) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([BlakeOrth](https://github.com/BlakeOrth)) +- Add builder to help create Schemas for shredding \(`ShreddedSchemaBuilder`\) [\#8940](https://github.com/apache/arrow-rs/pull/8940) ([XiangpengHao](https://github.com/XiangpengHao)) +- build\(deps\): update criterion requirement from 0.7.0 to 0.8.0 [\#8939](https://github.com/apache/arrow-rs/pull/8939) ([dependabot[bot]](https://github.com/apps/dependabot)) +- fix: Resolve Avro RecordEncoder bugs related to nullable Struct fields and Union type ids [\#8935](https://github.com/apache/arrow-rs/pull/8935) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jecsand838](https://github.com/jecsand838)) +- Some panic!s could more semantically be unimplemented! 
[\#8933](https://github.com/apache/arrow-rs/pull/8933) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([abacef](https://github.com/abacef)) +- fix: ipc decode panic with invalid data [\#8931](https://github.com/apache/arrow-rs/pull/8931) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([leiysky](https://github.com/leiysky)) +- Allow creating zero-sized FixedSizeBinary arrays [\#8927](https://github.com/apache/arrow-rs/pull/8927) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tobixdev](https://github.com/tobixdev)) +- Update `test_variant_get_error_when_cast_failure...` tests to use a valid `VariantArray` [\#8921](https://github.com/apache/arrow-rs/pull/8921) ([alamb](https://github.com/alamb)) +- Make flight sql client generic [\#8915](https://github.com/apache/arrow-rs/pull/8915) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([lewiszlw](https://github.com/lewiszlw)) +- \[minor\] Name Magic Number "8" in `FixedSizeBinaryArray::new_null` [\#8914](https://github.com/apache/arrow-rs/pull/8914) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tobixdev](https://github.com/tobixdev)) +- fix: cast Binary/String dictionary to view [\#8912](https://github.com/apache/arrow-rs/pull/8912) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- \[8910\]Fixed doc test with feature prettyprint [\#8911](https://github.com/apache/arrow-rs/pull/8911) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([manishkr](https://github.com/manishkr)) +- feat: `ArrayData::new_null` for `ListView` / `LargeListView` [\#8909](https://github.com/apache/arrow-rs/pull/8909) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dqkqd](https://github.com/dqkqd)) +- feat: add `GenericListViewArray::from_iter_primitive` [\#8907](https://github.com/apache/arrow-rs/pull/8907) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dqkqd](https://github.com/dqkqd)) +- fix: `GenericListViewArray::new_null` returns empty array [\#8905](https://github.com/apache/arrow-rs/pull/8905) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dqkqd](https://github.com/dqkqd)) +- Allocate a zeroed buffer for FixedSizeBinaryArray::null [\#8901](https://github.com/apache/arrow-rs/pull/8901) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tobixdev](https://github.com/tobixdev)) +- build\(deps\): bump actions/checkout from 5 to 6 [\#8899](https://github.com/apache/arrow-rs/pull/8899) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add getters to `UnionFields` [\#8895](https://github.com/apache/arrow-rs/pull/8895) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([friendlymatthew](https://github.com/friendlymatthew)) +- Add validated constructors for UnionFields [\#8891](https://github.com/apache/arrow-rs/pull/8891) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([friendlymatthew](https://github.com/friendlymatthew)) +- Add bit width check [\#8888](https://github.com/apache/arrow-rs/pull/8888) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([rambleraptor](https://github.com/rambleraptor)) +- \[Variant\] Improve `variant_get` performance on a perfect shredding [\#8887](https://github.com/apache/arrow-rs/pull/8887) ([XiangpengHao](https://github.com/XiangpengHao)) +- Add UnionArray::fields [\#8884](https://github.com/apache/arrow-rs/pull/8884) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([friendlymatthew](https://github.com/friendlymatthew)) +- Struct casting field order [\#8871](https://github.com/apache/arrow-rs/pull/8871) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([brancz](https://github.com/brancz)) +- Add support for `Union` types in `RowConverter` 
[\#8839](https://github.com/apache/arrow-rs/pull/8839) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([friendlymatthew](https://github.com/friendlymatthew)) +- Add comparison support for Union arrays [\#8838](https://github.com/apache/arrow-rs/pull/8838) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([friendlymatthew](https://github.com/friendlymatthew)) +- \[Variant\] Support array shredding into `List/LargeList/ListView/LargeListView` [\#8831](https://github.com/apache/arrow-rs/pull/8831) ([liamzwbao](https://github.com/liamzwbao)) +- Add support for using ListView arrays and types through FFI [\#8822](https://github.com/apache/arrow-rs/pull/8822) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([AdamGS](https://github.com/AdamGS)) +- Add ability to skip or transform page encoding statistics in Parquet metadata [\#8797](https://github.com/apache/arrow-rs/pull/8797) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Implement a `Vec` wrapper for `pyarrow.Table` convenience [\#8790](https://github.com/apache/arrow-rs/pull/8790) ([jonded94](https://github.com/jonded94)) +- Make Parquet SBBF serialize/deserialize helpers public for external reuse [\#8762](https://github.com/apache/arrow-rs/pull/8762) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([RoseZhang123](https://github.com/RoseZhang123)) +- Add cast support for \(Large\)ListView \<-\> \(Large\)List [\#8735](https://github.com/apache/arrow-rs/pull/8735) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([vegarsti](https://github.com/vegarsti)) diff --git a/Cargo.toml b/Cargo.toml index a1641d904b67..e4f1780d2914 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,7 +68,7 @@ exclude = [ ] [workspace.package] -version = "57.1.0" +version = "57.2.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow <dev@arrow.apache.org>"] @@ -85,26 +85,26 @@ edition = 
"2024" rust-version = "1.85" [workspace.dependencies] -arrow = { version = "57.1.0", path = "./arrow", default-features = false } -arrow-arith = { version = "57.1.0", path = "./arrow-arith" } -arrow-array = { version = "57.1.0", path = "./arrow-array" } -arrow-buffer = { version = "57.1.0", path = "./arrow-buffer" } -arrow-cast = { version = "57.1.0", path = "./arrow-cast" } -arrow-csv = { version = "57.1.0", path = "./arrow-csv" } -arrow-data = { version = "57.1.0", path = "./arrow-data" } -arrow-ipc = { version = "57.1.0", path = "./arrow-ipc" } -arrow-json = { version = "57.1.0", path = "./arrow-json" } -arrow-ord = { version = "57.1.0", path = "./arrow-ord" } -arrow-pyarrow = { version = "57.1.0", path = "./arrow-pyarrow" } -arrow-row = { version = "57.1.0", path = "./arrow-row" } -arrow-schema = { version = "57.1.0", path = "./arrow-schema" } -arrow-select = { version = "57.1.0", path = "./arrow-select" } -arrow-string = { version = "57.1.0", path = "./arrow-string" } -parquet = { version = "57.1.0", path = "./parquet", default-features = false } -parquet-geospatial = { version = "57.1.0", path = "./parquet-geospatial" } -parquet-variant = { version = "57.1.0", path = "./parquet-variant" } -parquet-variant-json = { version = "57.1.0", path = "./parquet-variant-json" } -parquet-variant-compute = { version = "57.1.0", path = "./parquet-variant-compute" } +arrow = { version = "57.2.0", path = "./arrow", default-features = false } +arrow-arith = { version = "57.2.0", path = "./arrow-arith" } +arrow-array = { version = "57.2.0", path = "./arrow-array" } +arrow-buffer = { version = "57.2.0", path = "./arrow-buffer" } +arrow-cast = { version = "57.2.0", path = "./arrow-cast" } +arrow-csv = { version = "57.2.0", path = "./arrow-csv" } +arrow-data = { version = "57.2.0", path = "./arrow-data" } +arrow-ipc = { version = "57.2.0", path = "./arrow-ipc" } +arrow-json = { version = "57.2.0", path = "./arrow-json" } +arrow-ord = { version = "57.2.0", path = "./arrow-ord" } 
+arrow-pyarrow = { version = "57.2.0", path = "./arrow-pyarrow" } +arrow-row = { version = "57.2.0", path = "./arrow-row" } +arrow-schema = { version = "57.2.0", path = "./arrow-schema" } +arrow-select = { version = "57.2.0", path = "./arrow-select" } +arrow-string = { version = "57.2.0", path = "./arrow-string" } +parquet = { version = "57.2.0", path = "./parquet", default-features = false } +parquet-geospatial = { version = "57.2.0", path = "./parquet-geospatial" } +parquet-variant = { version = "57.2.0", path = "./parquet-variant" } +parquet-variant-json = { version = "57.2.0", path = "./parquet-variant-json" } +parquet-variant-compute = { version = "57.2.0", path = "./parquet-variant-compute" } chrono = { version = "0.4.40", default-features = false, features = ["clock"] } diff --git a/README.md b/README.md index 56921f382860..7726fc4c0703 100644 --- a/README.md +++ b/README.md @@ -65,15 +65,17 @@ Planned Release Schedule | Approximate Date | Version | Notes | | ---------------- | ---------- | --------------------------------------- | -| October 2025 | [`57.0.0`] | Major, potentially breaking API changes | -| November 2025 | [`57.1.0`] | Minor, NO breaking API changes | | December 2025 | [`57.2.0`] | Minor, NO breaking API changes | | January 2026 | [`58.0.0`] | Major, potentially breaking API changes | +| February 2026 | [`58.1.0`] | Minor, NO breaking API changes | +| March 2026 | [`58.2.0`] | Minor, NO breaking API changes | +| April 2026 | [`59.0.0`] | Major, potentially breaking API changes | -[`57.0.0`]: https://github.com/apache/arrow-rs/issues/7835 -[`57.1.0`]: https://github.com/apache/arrow-rs/milestone/3 [`57.2.0`]: https://github.com/apache/arrow-rs/milestone/5 [`58.0.0`]: https://github.com/apache/arrow-rs/milestone/6 +[`58.1.0`]: https://github.com/apache/arrow-rs/issues/9108 +[`58.2.0`]: https://github.com/apache/arrow-rs/issues/9109 +[`59.0.0`]: https://github.com/apache/arrow-rs/issues/9110 [ticket #5368]: 
https://github.com/apache/arrow-rs/issues/5368 [semantic versioning]: https://semver.org/ diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index 91623bc22b92..a043259694c1 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -332,10 +332,10 @@ fn aggregate, A: Numeric /// Returns the minimum value in the boolean array. /// +/// # Example /// ``` /// # use arrow_array::BooleanArray; /// # use arrow_arith::aggregate::min_boolean; -/// /// let a = BooleanArray::from(vec![Some(true), None, Some(false)]); /// assert_eq!(min_boolean(&a), Some(false)) /// ``` @@ -390,10 +390,10 @@ pub fn min_boolean(array: &BooleanArray) -> Option { /// Returns the maximum value in the boolean array /// +/// # Example /// ``` /// # use arrow_array::BooleanArray; /// # use arrow_arith::aggregate::max_boolean; -/// /// let a = BooleanArray::from(vec![Some(true), None, Some(false)]); /// assert_eq!(max_boolean(&a), Some(true)) /// ``` @@ -809,6 +809,15 @@ where /// Returns the minimum value in the array, according to the natural order. /// For floating point arrays any NaN values are considered to be greater than any other non-null value +/// +/// # Example +/// ```rust +/// # use arrow_array::Int32Array; +/// # use arrow_arith::aggregate::min; +/// let array = Int32Array::from(vec![8, 2, 4]); +/// let result = min(&array); +/// assert_eq!(result, Some(2)); +/// ``` pub fn min(array: &PrimitiveArray) -> Option where T::Native: PartialOrd, @@ -818,6 +827,15 @@ where /// Returns the maximum value in the array, according to the natural order. 
/// For floating point arrays any NaN values are considered to be greater than any other non-null value +/// +/// # Example +/// ```rust +/// # use arrow_array::Int32Array; +/// # use arrow_arith::aggregate::max; +/// let array = Int32Array::from(vec![4, 8, 2]); +/// let result = max(&array); +/// assert_eq!(result, Some(8)); +/// ``` pub fn max(array: &PrimitiveArray) -> Option where T::Native: PartialOrd, diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs index d94df49de256..6bf438e64618 100644 --- a/arrow-arith/src/boolean.rs +++ b/arrow-arith/src/boolean.rs @@ -23,7 +23,7 @@ //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. use arrow_array::*; -use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_quaternary_op_helper}; +use arrow_buffer::buffer::bitwise_quaternary_op_helper; use arrow_buffer::{BooleanBuffer, NullBuffer, buffer_bin_and_not}; use arrow_schema::ArrowError; @@ -74,7 +74,7 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result Result { // Same as above - Some(bitwise_bin_op_helper( + Some(BooleanBuffer::from_bitwise_binary_op( right_null_buffer.buffer(), right_null_buffer.offset(), left_values.inner(), @@ -100,7 +100,7 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result Result Result Result { // Same as above - Some(bitwise_bin_op_helper( + Some(BooleanBuffer::from_bitwise_binary_op( right_nulls.buffer(), right_nulls.offset(), left_values.inner(), @@ -195,7 +196,7 @@ pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result Result &dyn Any { self diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index fbd8458846fc..bd85bffcfe44 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -462,6 +462,8 @@ impl std::fmt::Debug for GenericByteArray { } } +impl super::private::Sealed for GenericByteArray {} + impl Array for GenericByteArray { fn as_any(&self) -> &dyn Any { self diff 
--git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 77ef229068dd..09f0f56ba3ac 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -865,6 +865,8 @@ impl Debug for GenericByteViewArray { } } +impl super::private::Sealed for GenericByteViewArray {} + impl Array for GenericByteViewArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 5243218392f6..be7703b13c5c 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -697,6 +697,8 @@ impl<'a, T: ArrowDictionaryKeyType> FromIterator<&'a str> for DictionaryArray } } +impl super::private::Sealed for DictionaryArray {} + impl Array for DictionaryArray { fn as_any(&self) -> &dyn Any { self @@ -856,6 +858,8 @@ impl<'a, K: ArrowDictionaryKeyType, V> TypedDictionaryArray<'a, K, V> { } } +impl super::private::Sealed for TypedDictionaryArray<'_, K, V> {} + impl Array for TypedDictionaryArray<'_, K, V> { fn as_any(&self) -> &dyn Any { self.dictionary diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index d13cecb18027..b94e168cfe7c 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -602,6 +602,8 @@ impl std::fmt::Debug for FixedSizeBinaryArray { } } +impl super::private::Sealed for FixedSizeBinaryArray {} + impl Array for FixedSizeBinaryArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index fca92a64812c..3d5e8a0787c2 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -462,6 +462,8 @@ impl From for ArrayData { } } +impl super::private::Sealed for FixedSizeListArray {} + impl Array for 
FixedSizeListArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 32add1abf557..225be14ae365 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -525,6 +525,8 @@ impl GenericListArray { } } +impl super::private::Sealed for GenericListArray {} + impl Array for GenericListArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index 867dcf955be7..52c88d581d20 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -415,6 +415,8 @@ impl ArrayAccessor for &GenericListViewArray super::private::Sealed for GenericListViewArray {} + impl Array for GenericListViewArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index b5e611a92b57..86608d586f34 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -361,6 +361,8 @@ impl MapArray { } } +impl super::private::Sealed for MapArray {} + impl Array for MapArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index bb114be95045..75e32d57e89c 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -78,8 +78,18 @@ pub use list_view_array::*; use crate::iterator::ArrayIter; +mod private { + /// Private marker trait to ensure [`super::Array`] can not be implemented outside this crate + pub trait Sealed {} + + impl Sealed for &T {} +} + /// An array in the [arrow columnar format](https://arrow.apache.org/docs/format/Columnar.html) -pub trait Array: std::fmt::Debug + Send + Sync { +/// +/// This trait is sealed as it is not intended for custom array types, rather only +/// those defined in this crate. 
+pub trait Array: std::fmt::Debug + Send + Sync + private::Sealed { /// Returns the array as [`Any`] so that it can be /// downcasted to a specific implementation. /// @@ -341,6 +351,8 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// A reference-counted reference to a generic `Array` pub type ArrayRef = Arc; +impl private::Sealed for ArrayRef {} + /// Ergonomics: Allow use of an ArrayRef as an `&dyn Array` impl Array for ArrayRef { fn as_any(&self) -> &dyn Any { diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index 72556a92a3bc..b682466b6738 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -76,6 +76,8 @@ impl NullArray { } } +impl super::private::Sealed for NullArray {} + impl Array for NullArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index e71f4d47193f..457c2428145e 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1190,6 +1190,8 @@ impl From> for ArrayData { } } +impl super::private::Sealed for PrimitiveArray {} + impl Array for PrimitiveArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index ddc99f8e172d..5254a0ed3cdc 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -260,6 +260,8 @@ impl From> for ArrayData { } } +impl super::private::Sealed for RunArray {} + impl Array for RunArray { fn as_any(&self) -> &dyn Any { self @@ -519,6 +521,8 @@ impl<'a, R: RunEndIndexType, V> TypedRunArray<'a, R, V> { } } +impl super::private::Sealed for TypedRunArray<'_, R, V> {} + impl Array for TypedRunArray<'_, R, V> { fn as_any(&self) -> &dyn Any { self.run_array diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 5b18bd35d026..6ad1ead0d250 100644 --- 
a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -401,6 +401,8 @@ impl TryFrom> for StructArray { } } +impl super::private::Sealed for StructArray {} + impl Array for StructArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 934107d075f7..e08542bc8638 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -738,6 +738,8 @@ impl From for ArrayData { } } +impl super::private::Sealed for UnionArray {} + impl Array for UnionArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 548401ed4201..f7f1b025ed69 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -165,12 +165,14 @@ impl BooleanBuffer { /// * `op` must only apply bitwise operations /// on the relevant bits; the input `u64` may contain irrelevant bits /// and may be processed differently on different endian architectures. 
+ /// * `op` may be called with input bits outside the requested range /// * The output always has zero offset /// /// # See Also + /// - [`BooleanBuffer::from_bitwise_binary_op`] to create a new buffer from a binary operation /// - [`apply_bitwise_unary_op`](bit_util::apply_bitwise_unary_op) for in-place unary bitwise operations /// - /// # Example: Create new [`BooleanBuffer`] from bitwise `NOT` of an input [`Buffer`] + /// # Example: Create new [`BooleanBuffer`] from bitwise `NOT` of a byte slice /// ``` /// # use arrow_buffer::BooleanBuffer; /// let input = [0b11001100u8, 0b10111010u8]; // 2 bytes = 16 bits @@ -220,9 +222,8 @@ impl BooleanBuffer { result.truncate(chunks.num_bytes()); } - let buffer = Buffer::from(result); BooleanBuffer { - buffer, + buffer: Buffer::from(result), bit_offset: 0, bit_len: len_in_bits, } @@ -253,6 +254,112 @@ impl BooleanBuffer { Some(BooleanBuffer::new(buffer, 0, len_in_bits)) } + /// Create a new [`BooleanBuffer`] by applying the bitwise operation `op` to + /// the relevant bits from two input buffers. + /// + /// This function is faster than applying the operation bit by bit as + /// it processes input buffers in chunks of 64 bits (8 bytes) at a time + /// + /// # Notes: + /// See notes on [Self::from_bitwise_unary_op] + /// + /// # See Also + /// - [`BooleanBuffer::from_bitwise_unary_op`] for unary operations on a single input buffer. 
+ /// - [`apply_bitwise_binary_op`](bit_util::apply_bitwise_binary_op) for in-place binary bitwise operations + /// + /// # Example: Create new [`BooleanBuffer`] from bitwise `AND` of two [`Buffer`]s + /// ``` + /// # use arrow_buffer::{Buffer, BooleanBuffer}; + /// let left = Buffer::from(vec![0b11001100u8, 0b10111010u8]); // 2 bytes = 16 bits + /// let right = Buffer::from(vec![0b10101010u8, 0b11011100u8, 0b11110000u8]); // 3 bytes = 24 bits + /// // AND of the first 12 bits + /// let result = BooleanBuffer::from_bitwise_binary_op( + /// &left, 0, &right, 0, 12, |a, b| a & b + /// ); + /// assert_eq!(result.inner().as_slice(), &[0b10001000u8, 0b00001000u8]); + /// ``` + /// + /// # Example: Create new [`BooleanBuffer`] from bitwise `OR` of two byte slices + /// ``` + /// # use arrow_buffer::BooleanBuffer; + /// let left = [0b11001100u8, 0b10111010u8]; + /// let right = [0b10101010u8, 0b11011100u8]; + /// // OR of bits 4..16 from left and bits 0..12 from right + /// let result = BooleanBuffer::from_bitwise_binary_op( + /// &left, 4, &right, 0, 12, |a, b| a | b + /// ); + /// assert_eq!(result.inner().as_slice(), &[0b10101110u8, 0b00001111u8]); + /// ``` + pub fn from_bitwise_binary_op( + left: impl AsRef<[u8]>, + left_offset_in_bits: usize, + right: impl AsRef<[u8]>, + right_offset_in_bits: usize, + len_in_bits: usize, + mut op: F, + ) -> Self + where + F: FnMut(u64, u64) -> u64, + { + let left = left.as_ref(); + let right = right.as_ref(); + // try fast path for aligned input + // If the underlying buffers are aligned to u64 we can apply the operation directly on the u64 slices + // to improve performance. 
+ if left_offset_in_bits & 0x7 == 0 && right_offset_in_bits & 0x7 == 0 { + // align to byte boundary + let left = &left[left_offset_in_bits / 8..]; + let right = &right[right_offset_in_bits / 8..]; + + unsafe { + let (left_prefix, left_u64s, left_suffix) = left.align_to::(); + let (right_prefix, right_u64s, right_suffix) = right.align_to::(); + // if there is no prefix or suffix, both buffers are aligned and + // we can do the operation directly on u64s. + // TODO: consider `slice::as_chunks` and `u64::from_le_bytes` when MSRV reaches 1.88. + // https://github.com/apache/arrow-rs/pull/9022#discussion_r2639949361 + if left_prefix.is_empty() + && right_prefix.is_empty() + && left_suffix.is_empty() + && right_suffix.is_empty() + { + let result_u64s = left_u64s + .iter() + .zip(right_u64s.iter()) + .map(|(l, r)| op(*l, *r)) + .collect::>(); + return BooleanBuffer { + buffer: Buffer::from(result_u64s), + bit_offset: 0, + bit_len: len_in_bits, + }; + } + } + } + let left_chunks = BitChunks::new(left, left_offset_in_bits, len_in_bits); + let right_chunks = BitChunks::new(right, right_offset_in_bits, len_in_bits); + + let chunks = left_chunks + .iter() + .zip(right_chunks.iter()) + .map(|(left, right)| op(left, right)); + // Soundness: `BitChunks` is a `BitChunks` trusted length iterator which + // correctly reports its upper bound + let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(chunks) }; + + let remainder_bytes = bit_util::ceil(left_chunks.remainder_len(), 8); + let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits()); + // we are counting bits starting from the least significant bit, so to_le_bytes should be correct + let rem = &rem.to_le_bytes()[0..remainder_bytes]; + buffer.extend_from_slice(rem); + + BooleanBuffer { + buffer: Buffer::from(buffer), + bit_offset: 0, + bit_len: len_in_bits, + } + } + /// Returns the number of set bits in this buffer pub fn count_set_bits(&self) -> usize { self.buffer @@ -655,4 +762,42 @@ mod tests {
assert_eq!(result, expected); } } + + #[test] + fn test_from_bitwise_binary_op() { + // pick random boolean inputs + let input_bools_left = (0..1024) + .map(|_| rand::random::()) + .collect::>(); + let input_bools_right = (0..1024) + .map(|_| rand::random::()) + .collect::>(); + let input_buffer_left = BooleanBuffer::from(&input_bools_left[..]); + let input_buffer_right = BooleanBuffer::from(&input_bools_right[..]); + + for left_offset in 0..200 { + for right_offset in [0, 4, 5, 17, 33, 24, 45, 64, 65, 100, 200] { + for len_offset in [0, 1, 44, 100, 256, 300, 512] { + let len = 1024 - len_offset - left_offset.max(right_offset); // ensure we don't go out of bounds + // compute with AND + let result = BooleanBuffer::from_bitwise_binary_op( + input_buffer_left.values(), + left_offset, + input_buffer_right.values(), + right_offset, + len, + |a, b| a & b, + ); + // compute directly from bools + let expected = input_bools_left[left_offset..] + .iter() + .zip(&input_bools_right[right_offset..]) + .take(len) + .map(|(a, b)| *a & *b) + .collect::(); + assert_eq!(result, expected); + } + } + } + } } diff --git a/arrow-buffer/src/buffer/ops.rs b/arrow-buffer/src/buffer/ops.rs index 05593504b1cf..36efe876432d 100644 --- a/arrow-buffer/src/buffer/ops.rs +++ b/arrow-buffer/src/buffer/ops.rs @@ -20,7 +20,12 @@ use crate::BooleanBuffer; use crate::util::bit_util::ceil; /// Apply a bitwise operation `op` to four inputs and return the result as a Buffer. -/// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. +/// +/// The inputs are treated as bitmaps, meaning that offsets and length are +/// specified in number of bits. +/// +/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits +/// outside the offsets and len are zeroed out before calling `op`. 
pub fn bitwise_quaternary_op_helper( buffers: [&Buffer; 4], offsets: [usize; 4], @@ -60,7 +65,12 @@ where } /// Apply a bitwise operation `op` to two inputs and return the result as a Buffer. -/// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. +/// +/// The inputs are treated as bitmaps, meaning that offsets and length are +/// specified in number of bits. +/// +/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits +/// outside the offsets and len are zeroed out before calling `op`. pub fn bitwise_bin_op_helper( left: &Buffer, left_offset_in_bits: usize, @@ -93,21 +103,42 @@ where } /// Apply a bitwise operation `op` to one input and return the result as a Buffer. -/// The input is treated as a bitmap, meaning that offset and length are specified in number of bits. -#[deprecated( - since = "57.2.0", - note = "use BooleanBuffer::from_bitwise_unary_op instead" -)] +/// +/// The input is treated as a bitmap, meaning that offset and length are +/// specified in number of bits. +/// +/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits +/// outside the offsets and len are zeroed out before calling `op`. 
pub fn bitwise_unary_op_helper( left: &Buffer, offset_in_bits: usize, len_in_bits: usize, - op: F, + mut op: F, ) -> Buffer where F: FnMut(u64) -> u64, { - BooleanBuffer::from_bitwise_unary_op(left, offset_in_bits, len_in_bits, op).into_inner() + // reserve capacity and set length so we can get a typed view of u64 chunks + let mut result = + MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64 * 8, false); + + let left_chunks = left.bit_chunks(offset_in_bits, len_in_bits); + + let result_chunks = result.typed_data_mut::().iter_mut(); + + result_chunks + .zip(left_chunks.iter()) + .for_each(|(res, left)| { + *res = op(left); + }); + + let remainder_bytes = ceil(left_chunks.remainder_len(), 8); + let rem = op(left_chunks.remainder_bits()); + // we are counting bits starting from the least significant bit, so to_le_bytes should be correct + let rem = &rem.to_le_bytes()[0..remainder_bytes]; + result.extend_from_slice(rem); + + result.into() } /// Apply a bitwise and to two inputs and return the result as a Buffer. @@ -119,7 +150,7 @@ pub fn buffer_bin_and( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( + BooleanBuffer::from_bitwise_binary_op( left, left_offset_in_bits, right, @@ -127,6 +158,7 @@ pub fn buffer_bin_and( len_in_bits, |a, b| a & b, ) + .into_inner() } /// Apply a bitwise or to two inputs and return the result as a Buffer. @@ -138,7 +170,7 @@ pub fn buffer_bin_or( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( + BooleanBuffer::from_bitwise_binary_op( left, left_offset_in_bits, right, @@ -146,6 +178,7 @@ pub fn buffer_bin_or( len_in_bits, |a, b| a | b, ) + .into_inner() } /// Apply a bitwise xor to two inputs and return the result as a Buffer. 
@@ -157,7 +190,7 @@ pub fn buffer_bin_xor( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( + BooleanBuffer::from_bitwise_binary_op( left, left_offset_in_bits, right, @@ -165,6 +198,7 @@ pub fn buffer_bin_xor( len_in_bits, |a, b| a ^ b, ) + .into_inner() } /// Apply a bitwise and_not to two inputs and return the result as a Buffer. @@ -176,7 +210,7 @@ pub fn buffer_bin_and_not( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( + BooleanBuffer::from_bitwise_binary_op( left, left_offset_in_bits, right, @@ -184,11 +218,11 @@ pub fn buffer_bin_and_not( len_in_bits, |a, b| a & !b, ) + .into_inner() } /// Apply a bitwise not to one input and return the result as a Buffer. /// The input is treated as a bitmap, meaning that offset and length are specified in number of bits. pub fn buffer_unary_not(left: &Buffer, offset_in_bits: usize, len_in_bits: usize) -> Buffer { - // TODO: should we deprecate this function in favor of the Buffer ! impl ? BooleanBuffer::from_bitwise_unary_op(left, offset_in_bits, len_in_bits, |a| !a).into_inner() } diff --git a/arrow-buffer/src/builder/boolean.rs b/arrow-buffer/src/builder/boolean.rs index 512f729fda3e..41a75ef3e2c1 100644 --- a/arrow-buffer/src/builder/boolean.rs +++ b/arrow-buffer/src/builder/boolean.rs @@ -140,7 +140,6 @@ impl BooleanBufferBuilder { /// Reserve space to at least `additional` new bits. /// Capacity will be `>= self.len() + additional`. - /// New bytes are uninitialized and reading them is undefined behavior. 
#[inline] pub fn reserve(&mut self, additional: usize) { let capacity = self.len + additional; diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 99f2a2ed462b..db900341560c 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -603,6 +603,12 @@ impl FlightInfo { self } + /// Add endpoints for fetching all data + pub fn with_endpoints(mut self, endpoints: Vec) -> Self { + self.endpoint = endpoints; + self + } + /// Add a [`FlightDescriptor`] describing what this data is pub fn with_descriptor(mut self, flight_descriptor: FlightDescriptor) -> Self { self.flight_descriptor = Some(flight_descriptor); diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index f2653ec4e46e..5fcde480eb6d 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -65,3 +65,7 @@ rand = { version = "0.9", default-features = false, features = ["std", "std_rng" [[bench]] name = "serde" harness = false + +[[bench]] +name = "json-reader" +harness = false diff --git a/arrow-json/benches/json-reader.rs b/arrow-json/benches/json-reader.rs new file mode 100644 index 000000000000..504839f8ffe2 --- /dev/null +++ b/arrow-json/benches/json-reader.rs @@ -0,0 +1,250 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_json::ReaderBuilder; +use arrow_json::reader::Decoder; +use arrow_schema::{DataType, Field, Schema}; +use criterion::{ + BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, +}; +use serde_json::{Map, Number, Value}; +use std::fmt::Write; +use std::hint::black_box; +use std::sync::Arc; + +const ROWS: usize = 1 << 17; // 128K rows +const BATCH_SIZE: usize = 1 << 13; // 8K rows per batch + +const WIDE_FIELDS: usize = 64; +const BINARY_BYTES: usize = 64; +const WIDE_PROJECTION_TOTAL_FIELDS: usize = 100; // 100 fields total, select only 3 + +fn decode_and_flush(decoder: &mut Decoder, data: &[u8]) { + let mut offset = 0; + while offset < data.len() { + let read = decoder.decode(black_box(&data[offset..])).unwrap(); + if read == 0 { + break; + } + offset += read; + while let Some(_batch) = decoder.flush().unwrap() {} + } +} + +fn build_schema(field_count: usize) -> Arc { + // Builds a schema with fields named f0..f{field_count-1}, all Int64 and non-nullable. + let fields: Vec = (0..field_count) + .map(|i| Field::new(format!("f{i}"), DataType::Int64, false)) + .collect(); + Arc::new(Schema::new(fields)) +} + +fn build_projection_schema(indices: &[usize]) -> Arc { + let fields: Vec = indices + .iter() + .map(|i| Field::new(format!("f{i}"), DataType::Int64, false)) + .collect(); + Arc::new(Schema::new(fields)) +} + +fn build_wide_json(rows: usize, fields: usize) -> Vec { + // Builds newline-delimited JSON objects with "wide" schema. 
+ // Example (rows=2, fields=3): + // {"f0":0,"f1":1,"f2":2} + // {"f0":1,"f1":2,"f2":3} + let mut out = String::with_capacity(rows * fields * 12); + for row in 0..rows { + out.push('{'); + for field in 0..fields { + if field > 0 { + out.push(','); + } + let value = row as i64 + field as i64; + write!(&mut out, "\"f{field}\":{value}").unwrap(); + } + out.push('}'); + out.push('\n'); + } + out.into_bytes() +} + +fn build_wide_values(rows: usize, fields: usize) -> Vec { + // Mirrors build_wide_json but returns structured serde_json::Value objects. + let mut out = Vec::with_capacity(rows); + for row in 0..rows { + let mut map = Map::with_capacity(fields); + for field in 0..fields { + let key = format!("f{field}"); + let value = Number::from((row + field) as i64); + map.insert(key, Value::Number(value)); + } + out.push(Value::Object(map)); + } + out +} + +fn bench_decode_wide_object(c: &mut Criterion) { + let data = build_wide_json(ROWS, WIDE_FIELDS); + let schema = build_schema(WIDE_FIELDS); + + c.bench_function("decode_wide_object_i64_json", |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new(schema.clone()) + .with_batch_size(BATCH_SIZE) + .build_decoder() + .unwrap(); + decode_and_flush(&mut decoder, &data); + }) + }); +} + +fn bench_serialize_wide_object(c: &mut Criterion) { + let values = build_wide_values(ROWS, WIDE_FIELDS); + let schema = build_schema(WIDE_FIELDS); + + c.bench_function("decode_wide_object_i64_serialize", |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new(schema.clone()) + .with_batch_size(BATCH_SIZE) + .build_decoder() + .unwrap(); + + decoder.serialize(&values).unwrap(); + while let Some(_batch) = decoder.flush().unwrap() {} + }) + }); +} + +fn bench_decode_binary(c: &mut Criterion, name: &str, data: &[u8], field: Arc) { + c.bench_function(name, |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new_with_field(field.clone()) + .with_batch_size(BATCH_SIZE) + .build_decoder() + .unwrap(); + decode_and_flush(&mut decoder, 
data); + }) + }); +} + +#[inline] +fn append_hex_byte(buf: &mut String, byte: u8) { + const HEX: &[u8; 16] = b"0123456789abcdef"; + buf.push(HEX[(byte >> 4) as usize] as char); + buf.push(HEX[(byte & 0x0f) as usize] as char); +} + +fn build_hex_lines(rows: usize, bytes_per_row: usize) -> Vec { + let mut data = String::with_capacity(rows * (bytes_per_row * 2 + 3)); + for row in 0..rows { + data.push('"'); + for i in 0..bytes_per_row { + let byte = ((row + i) & 0xff) as u8; + append_hex_byte(&mut data, byte); + } + data.push('"'); + data.push('\n'); + } + data.into_bytes() +} + +fn bench_binary_hex(c: &mut Criterion) { + let binary_data = build_hex_lines(ROWS, BINARY_BYTES); + + let binary_field = Arc::new(Field::new("item", DataType::Binary, false)); + bench_decode_binary(c, "decode_binary_hex_json", &binary_data, binary_field); + + let fixed_field = Arc::new(Field::new( + "item", + DataType::FixedSizeBinary(BINARY_BYTES as i32), + false, + )); + bench_decode_binary(c, "decode_fixed_binary_hex_json", &binary_data, fixed_field); + + let view_field = Arc::new(Field::new("item", DataType::BinaryView, false)); + bench_decode_binary(c, "decode_binary_view_hex_json", &binary_data, view_field); +} + +fn bench_decode_schema(c: &mut Criterion, name: &str, data: &[u8], schema: Arc) { + let mut group = c.benchmark_group(name); + group.throughput(Throughput::Bytes(data.len() as u64)); + group.sample_size(50); + group.measurement_time(std::time::Duration::from_secs(5)); + group.warm_up_time(std::time::Duration::from_secs(2)); + group.sampling_mode(SamplingMode::Flat); + group.bench_function(BenchmarkId::from_parameter(ROWS), |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new(schema.clone()) + .with_batch_size(BATCH_SIZE) + .build_decoder() + .unwrap(); + decode_and_flush(&mut decoder, data); + }) + }); + group.finish(); +} + +fn build_wide_projection_json(rows: usize, total_fields: usize) -> Vec { + // Estimate: each field ~15 bytes ("fXX":VVVVVVV,), total ~15*100 + 
overhead + let per_row_size = total_fields * 15 + 10; + let mut data = String::with_capacity(rows * per_row_size); + + for _row in 0..rows { + data.push('{'); + for i in 0..total_fields { + if i > 0 { + data.push(','); + } + // Use fixed-width values for stable benchmarks: 7 digits + let _ = write!(data, "\"f{}\":{:07}", i, i); + } + data.push('}'); + data.push('\n'); + } + data.into_bytes() +} + +fn bench_wide_projection(c: &mut Criterion) { + // Wide projection workload: tests overhead of parsing unused fields + let wide_projection_data = build_wide_projection_json(ROWS, WIDE_PROJECTION_TOTAL_FIELDS); + + let full_schema = build_schema(WIDE_PROJECTION_TOTAL_FIELDS); + bench_decode_schema( + c, + "decode_wide_projection_full_json", + &wide_projection_data, + full_schema, + ); + + // Projected schema: only 3 fields (f0, f10, f50) out of 100 + let projected_schema = build_projection_schema(&[0, 10, 50]); + bench_decode_schema( + c, + "decode_wide_projection_narrow_json", + &wide_projection_data, + projected_schema, + ); +} + +criterion_group!( + benches, + bench_decode_wide_object, + bench_serialize_wide_object, + bench_binary_hex, + bench_wide_projection +); +criterion_main!(benches); diff --git a/arrow-json/benches/serde.rs b/arrow-json/benches/serde.rs index 23f005cc90ab..282f2e7c76d0 100644 --- a/arrow-json/benches/serde.rs +++ b/arrow-json/benches/serde.rs @@ -22,12 +22,14 @@ use rand::{Rng, rng}; use serde::Serialize; use std::sync::Arc; +const ROWS: usize = 1 << 18; + #[allow(deprecated)] fn do_bench(c: &mut Criterion, name: &str, rows: &[R], schema: &Schema) { let schema = Arc::new(schema.clone()); c.bench_function(name, |b| { b.iter(|| { - let builder = ReaderBuilder::new(schema.clone()).with_batch_size(64); + let builder = ReaderBuilder::new(schema.clone()).with_batch_size(8192); let mut decoder = builder.build_decoder().unwrap(); decoder.serialize(rows) }) @@ -37,26 +39,26 @@ fn do_bench(c: &mut Criterion, name: &str, rows: &[R], schema: &Sc fn 
criterion_benchmark(c: &mut Criterion) { let mut rng = rng(); let schema = Schema::new(vec![Field::new("i32", DataType::Int32, false)]); - let v: Vec = (0..2048).map(|_| rng.random_range(0..10000)).collect(); + let v: Vec = (0..ROWS).map(|_| rng.random_range(0..10000)).collect(); do_bench(c, "small_i32", &v, &schema); - let v: Vec = (0..2048).map(|_| rng.random()).collect(); + let v: Vec = (0..ROWS).map(|_| rng.random()).collect(); do_bench(c, "large_i32", &v, &schema); let schema = Schema::new(vec![Field::new("i64", DataType::Int64, false)]); - let v: Vec = (0..2048).map(|_| rng.random_range(0..10000)).collect(); + let v: Vec = (0..ROWS).map(|_| rng.random_range(0..10000)).collect(); do_bench(c, "small_i64", &v, &schema); - let v: Vec = (0..2048) + let v: Vec = (0..ROWS) .map(|_| rng.random_range(0..i32::MAX as _)) .collect(); do_bench(c, "medium_i64", &v, &schema); - let v: Vec = (0..2048).map(|_| rng.random()).collect(); + let v: Vec = (0..ROWS).map(|_| rng.random()).collect(); do_bench(c, "large_i64", &v, &schema); let schema = Schema::new(vec![Field::new("f32", DataType::Float32, false)]); - let v: Vec = (0..2048).map(|_| rng.random_range(0.0..10000.)).collect(); + let v: Vec = (0..ROWS).map(|_| rng.random_range(0.0..10000.)).collect(); do_bench(c, "small_f32", &v, &schema); - let v: Vec = (0..2048).map(|_| rng.random_range(0.0..f32::MAX)).collect(); + let v: Vec = (0..ROWS).map(|_| rng.random_range(0.0..f32::MAX)).collect(); do_bench(c, "large_f32", &v, &schema); } diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index fdedbbcae930..39d56f8fe9b2 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -846,7 +846,7 @@ pub struct SortColumn { /// Returns an `ArrowError::ComputeError(String)` if any of the array type is either unsupported by /// `lexsort_to_indices` or `take`. 
/// -/// Example: +/// # Example: /// /// ``` /// # use std::convert::From; @@ -855,7 +855,6 @@ pub struct SortColumn { /// # use arrow_array::types::Int64Type; /// # use arrow_array::cast::AsArray; /// # use arrow_ord::sort::{SortColumn, SortOptions, lexsort}; -/// /// let sorted_columns = lexsort(&vec![ /// SortColumn { /// values: Arc::new(PrimitiveArray::::from(vec![ diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 72a295627ed2..4cafbc2748ee 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -415,6 +415,41 @@ mod variable; /// ///``` /// +/// ## Union Encoding +/// +/// A union value is encoded as a single type-id byte followed by the row encoding of the selected child value. +/// The type-id byte is always present; union arrays have no top-level null marker, so nulls are represented by the child encoding. +/// +/// For example, given a union of Int32 (type_id = 0) and Utf8 (type_id = 1): +/// +/// ```text +/// ┌──┬──────────────┐ +/// 3 │00│01│80│00│00│03│ +/// └──┴──────────────┘ +/// │ └─ signed integer encoding (non-null) +/// └──── type_id +/// +/// ┌──┬────────────────────────────────┐ +/// "abc" │01│02│'a'│'b'│'c'│00│00│00│00│00│03│ +/// └──┴────────────────────────────────┘ +/// │ └─ string encoding (non-null) +/// └──── type_id +/// +/// ┌──┬──────────────┐ +/// null Int32 │00│00│00│00│00│00│ +/// └──┴──────────────┘ +/// │ └─ signed integer encoding (null) +/// └──── type_id +/// +/// ┌──┬──┐ +/// null Utf8 │01│00│ +/// └──┴──┘ +/// │ └─ string encoding (null) +/// └──── type_id +/// ``` +/// +/// See [`UnionArray`] for more details on union types. +/// /// # Ordering /// /// ## Float Ordering @@ -431,6 +466,12 @@ mod variable; /// The encoding described above will order nulls first, this can be inverted by representing /// nulls as `0xFF_u8` instead of `0_u8` /// +/// ## Union Ordering +/// +/// Values of the same type are ordered according to the ordering of that type. 
+/// Values of different types are ordered by their type id. +/// The type_id is negated when descending order is specified. +/// /// ## Reverse Column Ordering /// /// The order of a given column can be reversed by negating the encoded bytes of non-null values @@ -892,7 +933,7 @@ impl RowConverter { // and therefore must be valid let result = unsafe { self.convert_raw(&mut rows, validate_utf8) }?; - if cfg!(test) { + if cfg!(debug_assertions) { for (i, row) in rows.iter().enumerate() { if !row.is_empty() { return Err(ArrowError::InvalidArgumentError(format!( @@ -1131,8 +1172,8 @@ impl Rows { pub fn size(&self) -> usize { // Size of fields is accounted for as part of RowConverter std::mem::size_of::() - + self.buffer.len() - + self.offsets.len() * std::mem::size_of::() + + self.buffer.capacity() + + self.offsets.capacity() * std::mem::size_of::() } /// Create a [BinaryArray] from the [Rows] data without reallocating the @@ -1644,24 +1685,22 @@ fn encode_column( } } DataType::Binary => { - variable::encode(data, offsets, as_generic_binary_array::(column).iter(), opts) + variable::encode_generic_byte_array(data, offsets, as_generic_binary_array::(column), opts) } DataType::BinaryView => { variable::encode(data, offsets, column.as_binary_view().iter(), opts) } DataType::LargeBinary => { - variable::encode(data, offsets, as_generic_binary_array::(column).iter(), opts) + variable::encode_generic_byte_array(data, offsets, as_generic_binary_array::(column), opts) } - DataType::Utf8 => variable::encode( + DataType::Utf8 => variable::encode_generic_byte_array( data, offsets, - column.as_string::().iter().map(|x| x.map(|x| x.as_bytes())), + column.as_string::(), opts, ), - DataType::LargeUtf8 => variable::encode( + DataType::LargeUtf8 => variable::encode_generic_byte_array( data, offsets, - column.as_string::() - .iter() - .map(|x| x.map(|x| x.as_bytes())), + column.as_string::(), opts, ), DataType::Utf8View => variable::encode( @@ -1903,12 +1942,9 @@ unsafe fn 
decode_column( let child_row = &row[1..]; rows_by_field[field_idx].push((idx, child_row)); - - *row = &row[row.len()..]; } let mut child_arrays: Vec = Vec::with_capacity(converters.len()); - let mut offsets = (*mode == UnionMode::Dense).then(|| Vec::with_capacity(len)); for (field_idx, converter) in converters.iter().enumerate() { @@ -1930,6 +1966,14 @@ unsafe fn decode_column( let child_array = unsafe { converter.convert_raw(&mut child_data, validate_utf8) }?; + // advance row slices by the bytes consumed + for ((row_idx, original_bytes), remaining_bytes) in + field_rows.iter().zip(child_data) + { + let consumed_length = 1 + original_bytes.len() - remaining_bytes.len(); + rows[*row_idx] = &rows[*row_idx][consumed_length..]; + } + child_arrays.push(child_array.into_iter().next().unwrap()); } UnionMode::Sparse => { @@ -1951,6 +1995,14 @@ unsafe fn decode_column( let child_array = unsafe { converter.convert_raw(&mut sparse_data, validate_utf8) }?; + + // advance row slices by the bytes consumed for rows that belong to this field + for (row_idx, child_row) in field_rows.iter() { + let remaining_len = sparse_data[*row_idx].len(); + let consumed_length = 1 + child_row.len() - remaining_len; + rows[*row_idx] = &rows[*row_idx][consumed_length..]; + } + child_arrays.push(child_array.into_iter().next().unwrap()); } } @@ -4050,4 +4102,185 @@ mod tests { // "a" < "z" assert!(rows.row(3) < rows.row(1)); } + + #[test] + fn test_row_converter_roundtrip_with_many_union_columns() { + // col 1: Union(Int32, Utf8) [67, "hello"] + let fields1 = UnionFields::try_new( + vec![0, 1], + vec![ + Field::new("int", DataType::Int32, true), + Field::new("string", DataType::Utf8, true), + ], + ) + .unwrap(); + + let int_array1 = Int32Array::from(vec![Some(67), None]); + let string_array1 = StringArray::from(vec![None::<&str>, Some("hello")]); + let type_ids1 = vec![0i8, 1].into(); + + let union_array1 = UnionArray::try_new( + fields1.clone(), + type_ids1, + None, + vec![ + Arc::new(int_array1) 
as ArrayRef, + Arc::new(string_array1) as ArrayRef, + ], + ) + .unwrap(); + + // col 2: Union(Int32, Utf8) [100, "world"] + let fields2 = UnionFields::try_new( + vec![0, 1], + vec![ + Field::new("int", DataType::Int32, true), + Field::new("string", DataType::Utf8, true), + ], + ) + .unwrap(); + + let int_array2 = Int32Array::from(vec![Some(100), None]); + let string_array2 = StringArray::from(vec![None::<&str>, Some("world")]); + let type_ids2 = vec![0i8, 1].into(); + + let union_array2 = UnionArray::try_new( + fields2.clone(), + type_ids2, + None, + vec![ + Arc::new(int_array2) as ArrayRef, + Arc::new(string_array2) as ArrayRef, + ], + ) + .unwrap(); + + // create a row converter with 2 union columns + let field1 = Field::new("col1", DataType::Union(fields1, UnionMode::Sparse), true); + let field2 = Field::new("col2", DataType::Union(fields2, UnionMode::Sparse), true); + + let sort_field1 = SortField::new(field1.data_type().clone()); + let sort_field2 = SortField::new(field2.data_type().clone()); + + let converter = RowConverter::new(vec![sort_field1, sort_field2]).unwrap(); + + let rows = converter + .convert_columns(&[ + Arc::new(union_array1.clone()) as ArrayRef, + Arc::new(union_array2.clone()) as ArrayRef, + ]) + .unwrap(); + + // roundtrip + let out = converter.convert_rows(&rows).unwrap(); + + let [col1, col2] = out.as_slice() else { + panic!("expected 2 columns") + }; + + let col1 = col1.as_any().downcast_ref::().unwrap(); + let col2 = col2.as_any().downcast_ref::().unwrap(); + + for (expected, got) in [union_array1, union_array2].iter().zip([col1, col2]) { + assert_eq!(expected.len(), got.len()); + assert_eq!(expected.type_ids(), got.type_ids()); + + for i in 0..expected.len() { + assert_eq!(expected.value(i).as_ref(), got.value(i).as_ref()); + } + } + } + + #[test] + fn test_row_converter_roundtrip_with_one_union_column() { + let fields = UnionFields::try_new( + vec![0, 1], + vec![ + Field::new("int", DataType::Int32, true), + Field::new("string", 
DataType::Utf8, true), + ], + ) + .unwrap(); + + let int_array = Int32Array::from(vec![Some(67), None]); + let string_array = StringArray::from(vec![None::<&str>, Some("hello")]); + let type_ids = vec![0i8, 1].into(); + + let union_array = UnionArray::try_new( + fields.clone(), + type_ids, + None, + vec![ + Arc::new(int_array) as ArrayRef, + Arc::new(string_array) as ArrayRef, + ], + ) + .unwrap(); + + let field = Field::new("col", DataType::Union(fields, UnionMode::Sparse), true); + let sort_field = SortField::new(field.data_type().clone()); + let converter = RowConverter::new(vec![sort_field]).unwrap(); + + let rows = converter + .convert_columns(&[Arc::new(union_array.clone()) as ArrayRef]) + .unwrap(); + + // roundtrip + let out = converter.convert_rows(&rows).unwrap(); + + let [col1] = out.as_slice() else { + panic!("expected 1 column") + }; + + let col = col1.as_any().downcast_ref::().unwrap(); + assert_eq!(col.len(), union_array.len()); + assert_eq!(col.type_ids(), union_array.type_ids()); + + for i in 0..col.len() { + assert_eq!(col.value(i).as_ref(), union_array.value(i).as_ref()); + } + } + + #[test] + fn rows_size_should_count_for_capacity() { + let row_converter = RowConverter::new(vec![SortField::new(DataType::UInt8)]).unwrap(); + + let empty_rows_size_with_preallocate_rows_and_data = { + let rows = row_converter.empty_rows(1000, 1000); + + rows.size() + }; + let empty_rows_size_with_preallocate_rows = { + let rows = row_converter.empty_rows(1000, 0); + + rows.size() + }; + let empty_rows_size_with_preallocate_data = { + let rows = row_converter.empty_rows(0, 1000); + + rows.size() + }; + let empty_rows_size_without_preallocate = { + let rows = row_converter.empty_rows(0, 0); + + rows.size() + }; + + assert!( + empty_rows_size_with_preallocate_rows_and_data > empty_rows_size_with_preallocate_rows, + "{empty_rows_size_with_preallocate_rows_and_data} should be larger than {empty_rows_size_with_preallocate_rows}" + ); + assert!( + 
empty_rows_size_with_preallocate_rows_and_data > empty_rows_size_with_preallocate_data, + "{empty_rows_size_with_preallocate_rows_and_data} should be larger than {empty_rows_size_with_preallocate_data}" + ); + assert!( + empty_rows_size_with_preallocate_rows > empty_rows_size_without_preallocate, + "{empty_rows_size_with_preallocate_rows} should be larger than {empty_rows_size_without_preallocate}" + ); + assert!( + empty_rows_size_with_preallocate_data > empty_rows_size_without_preallocate, + "{empty_rows_size_with_preallocate_data} should be larger than {empty_rows_size_without_preallocate}" + ); + } } diff --git a/arrow-row/src/run.rs b/arrow-row/src/run.rs index 3d962f43ada8..24eaaa18e018 100644 --- a/arrow-row/src/run.rs +++ b/arrow-row/src/run.rs @@ -134,7 +134,11 @@ pub unsafe fn decode( run_ends.push(R::Native::usize_as(idx)); } unique_row_indices.push(decoded_values.len()); - decoded_values.push(decoded_data.clone()); + let capacity = decoded_data.capacity(); + decoded_values.push(std::mem::replace( + &mut decoded_data, + Vec::with_capacity(capacity), + )); } } // Add the final run end diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs index ac2c4cb97c20..73e19b197f92 100644 --- a/arrow-row/src/variable.rs +++ b/arrow-row/src/variable.rs @@ -17,9 +17,10 @@ use crate::null_sentinel; use arrow_array::builder::BufferBuilder; +use arrow_array::types::ByteArrayType; use arrow_array::*; -use arrow_buffer::MutableBuffer; use arrow_buffer::bit_util::ceil; +use arrow_buffer::{ArrowNativeType, MutableBuffer}; use arrow_data::{ArrayDataBuilder, MAX_INLINE_VIEW_LEN}; use arrow_schema::{DataType, SortOptions}; use builder::make_view; @@ -84,6 +85,48 @@ pub fn encode<'a, I: Iterator>>( } } +/// Calls [`encode`] with optimized iterator for generic byte arrays +pub(crate) fn encode_generic_byte_array( + data: &mut [u8], + offsets: &mut [usize], + input_array: &GenericByteArray, + opts: SortOptions, +) { + let input_offsets = input_array.value_offsets(); + 
let bytes = input_array.values().as_slice(); + + if let Some(null_buffer) = input_array.nulls().filter(|x| x.null_count() > 0) { + let input_iter = + input_offsets + .windows(2) + .zip(null_buffer.iter()) + .map(|(start_end, is_valid)| { + if is_valid { + let item_range = start_end[0].as_usize()..start_end[1].as_usize(); + // SAFETY: the offsets of the input are valid by construction + // so it is ok to use unsafe here + let item = unsafe { bytes.get_unchecked(item_range) }; + Some(item) + } else { + None + } + }); + + encode(data, offsets, input_iter, opts); + } else { + // Skip null checks + let input_iter = input_offsets.windows(2).map(|start_end| { + let item_range = start_end[0].as_usize()..start_end[1].as_usize(); + // SAFETY: the offsets of the input are valid by construction + // so it is ok to use unsafe here + let item = unsafe { bytes.get_unchecked(item_range) }; + Some(item) + }); + + encode(data, offsets, input_iter, opts); + } +} + pub fn encode_null(out: &mut [u8], opts: SortOptions) -> usize { out[0] = null_sentinel(opts); 1 @@ -97,6 +140,7 @@ pub fn encode_empty(out: &mut [u8], opts: SortOptions) -> usize { 1 } +#[inline] pub fn encode_one(out: &mut [u8], val: Option<&[u8]>, opts: SortOptions) -> usize { match val { None => encode_null(out, opts), diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index cd9bf767e16b..fb6461a9e9ae 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -53,12 +53,9 @@ serde = ["dep:serde_core", "dep:serde"] all-features = true [dev-dependencies] -bincode = { version = "2.0.1", default-features = false, features = [ - "std", - "serde", -] } criterion = { workspace = true, default-features = false } insta = "1.43.1" +postcard = { version = "1.0.10", default-features = false, features = ["use-std"] } [[bench]] name = "ffi" diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index e3f67e6ac06a..40c28649c25b 100644 --- a/arrow-schema/src/datatype.rs +++ 
b/arrow-schema/src/datatype.rs @@ -591,6 +591,16 @@ impl DataType { matches!(self, UInt8 | UInt16 | UInt32 | UInt64) } + /// Returns true if this type is decimal: (Decimal*). + #[inline] + pub fn is_decimal(&self) -> bool { + use DataType::*; + matches!( + self, + Decimal32(..) | Decimal64(..) | Decimal128(..) | Decimal256(..) + ) + } + /// Returns true if this type is valid as a dictionary key #[inline] pub fn is_dictionary_key_type(&self) -> bool { @@ -1168,6 +1178,15 @@ mod tests { assert!(!DataType::is_floating(&DataType::Int32)); } + #[test] + fn test_decimal() { + assert!(DataType::is_decimal(&DataType::Decimal32(4, 2))); + assert!(DataType::is_decimal(&DataType::Decimal64(4, 2))); + assert!(DataType::is_decimal(&DataType::Decimal128(4, 2))); + assert!(DataType::is_decimal(&DataType::Decimal256(4, 2))); + assert!(!DataType::is_decimal(&DataType::Float16)); + } + #[test] fn test_datatype_is_null() { assert!(DataType::is_null(&DataType::Null)); diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 3b3372a78eae..1b9a298e5918 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -1448,10 +1448,8 @@ mod test { #[cfg(feature = "serde")] fn assert_binary_serde_round_trip(field: Field) { - let config = bincode::config::legacy(); - let serialized = bincode::serde::encode_to_vec(&field, config).unwrap(); - let (deserialized, _): (Field, _) = - bincode::serde::decode_from_slice(&serialized, config).unwrap(); + let serialized = postcard::to_stdvec(&field).unwrap(); + let deserialized: Field = postcard::from_bytes(&serialized).unwrap(); assert_eq!(field, deserialized) } diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index 211cabf7afc0..fa875c20e302 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -19,11 +19,11 @@ use arrow_array::{Array, ArrayRef, BooleanArray, make_array}; use arrow_buffer::buffer::bitwise_bin_op_helper; -use arrow_buffer::{BooleanBuffer, NullBuffer}; +use 
arrow_buffer::{BooleanBuffer, NullBuffer, bitwise_unary_op_helper}; use arrow_schema::{ArrowError, DataType}; /// Returns a new array with the same values and the validity bit to false where -/// the corresponding element of`right` is true. +/// the corresponding element of `right` is true. /// /// This can be used to implement SQL `NULLIF` /// @@ -91,13 +91,11 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { let mut null_count = 0; - let buffer = - BooleanBuffer::from_bitwise_unary_op(right.inner(), right.offset(), len, |b| { - let t = !b; - null_count += t.count_zeros() as usize; - t - }) - .into_inner(); + let buffer = bitwise_unary_op_helper(right.inner(), right.offset(), len, |b| { + let t = !b; + null_count += t.count_zeros() as usize; + t + }); (buffer, null_count) } }; @@ -122,7 +120,8 @@ mod tests { use arrow_array::{Int32Array, NullArray, StringArray, StructArray}; use arrow_data::ArrayData; use arrow_schema::{Field, Fields}; - use rand::{Rng, rng}; + use rand::prelude::StdRng; + use rand::{Rng, SeedableRng}; #[test] fn test_nullif_int_array() { @@ -494,23 +493,60 @@ mod tests { let r_data = r.to_data(); r_data.validate().unwrap(); - assert_eq!(r.as_ref(), &expected); + assert_eq!( + r.as_ref(), + &expected, + "expected nulls: {:#?}\n\n\ + result nulls: {:#?}\n\n\\ + expected values: {:#?}\n\n\ + result values: {:#?}", + expected.nulls(), + r.nulls(), + expected.values(), + r.as_primitive::().values() + ); + validate_nulls(expected.nulls()); + validate_nulls(r.nulls()); + } + + /// Ensures that the null count matches the actual number of nulls. 
+ fn validate_nulls(nulls: Option<&NullBuffer>) { + let Some(nulls) = nulls else { + return; + }; + let mut actual_null_count = 0; + for i in 0..nulls.len() { + if nulls.is_null(i) { + actual_null_count += 1; + } + } + assert_eq!(actual_null_count, nulls.null_count()); } #[test] fn nullif_fuzz() { - let mut rng = rng(); + let mut rng = StdRng::seed_from_u64(7337); let arrays = [ - Int32Array::from(vec![0; 128]), - (0..128) - .map(|_| rng.random_bool(0.5).then_some(0)) + Int32Array::from(vec![0; 1024]), // no nulls + (0..1024) // 50% nulls + .map(|_| rng.random_bool(0.5).then_some(1)) .collect(), ]; for a in arrays { - let a_slices = [(0, 128), (64, 64), (0, 64), (32, 32), (0, 0), (32, 0)]; - + let a_slices = [ + (0, 128), + (0, 129), + (64, 64), + (0, 64), + (32, 32), + (0, 0), + (32, 0), + (5, 800), + (33, 53), + (77, 101), + ]; for (a_offset, a_length) in a_slices { let a = a.slice(a_offset, a_length); @@ -518,14 +554,54 @@ mod tests { let b_start_offset = rng.random_range(0..i); let b_end_offset = rng.random_range(0..i); + // b with 50% nulls let b: BooleanArray = (0..a_length + b_start_offset + b_end_offset) .map(|_| rng.random_bool(0.5).then(|| rng.random_bool(0.5))) .collect(); - let b = b.slice(b_start_offset, a_length); - - test_nullif(&a, &b); + let b_sliced = b.slice(b_start_offset, a_length); + test_nullif(&a, &b_sliced); + + // b with no nulls (and no null buffer) + let b = remove_null_buffer(&b); + let b_sliced = b.slice(b_start_offset, a_length); + test_nullif(&a, &b_sliced); + + // b with no nulls (but with a null buffer) + let b = remove_null_values(&b); + let b_sliced = b.slice(b_start_offset, a_length); + test_nullif(&a, &b_sliced); } } } } + + /// Returns a new BooleanArray with no null buffer + fn remove_null_buffer(array: &BooleanArray) -> BooleanArray { + make_array( + array + .into_data() + .into_builder() + .nulls(None) + .build() + .unwrap(), + ) + .as_boolean() + .clone() + } + + /// Returns a new BooleanArray with a null buffer where all 
values are valid + fn remove_null_values(array: &BooleanArray) -> BooleanArray { + let len = array.len(); + let new_nulls = NullBuffer::from_iter(std::iter::repeat_n(true, len)); + make_array( + array + .into_data() + .into_builder() + .nulls(Some(new_nulls)) + .build() + .unwrap(), + ) + .as_boolean() + .clone() + } } diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 7f7791a07af0..1961a604d928 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -1011,7 +1011,6 @@ to_indices_reinterpret!(Int64Type, UInt64Type); /// # use arrow_array::{StringArray, Int32Array, UInt32Array, RecordBatch}; /// # use arrow_schema::{DataType, Field, Schema}; /// # use arrow_select::take::take_record_batch; -/// /// let schema = Arc::new(Schema::new(vec![ /// Field::new("a", DataType::Int32, true), /// Field::new("b", DataType::Utf8, true), diff --git a/arrow-select/src/window.rs b/arrow-select/src/window.rs index fbd145d08d9d..74f7f4a79191 100644 --- a/arrow-select/src/window.rs +++ b/arrow-select/src/window.rs @@ -29,7 +29,6 @@ use num_traits::abs; /// ``` /// # use arrow_array::Int32Array; /// # use arrow_select::window::shift; -/// /// let a: Int32Array = vec![Some(1), None, Some(4)].into(); /// /// // shift array 1 element to the right diff --git a/arrow-select/src/zip.rs b/arrow-select/src/zip.rs index e45b817dc6e8..8702b558d01f 100644 --- a/arrow-select/src/zip.rs +++ b/arrow-select/src/zip.rs @@ -19,20 +19,23 @@ use crate::filter::{SlicesIterator, prep_null_mask_filter}; use arrow_array::cast::AsArray; -use arrow_array::types::{BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, Utf8Type}; +use arrow_array::types::{ + BinaryType, BinaryViewType, ByteArrayType, ByteViewType, LargeBinaryType, LargeUtf8Type, + StringViewType, Utf8Type, +}; use arrow_array::*; use arrow_buffer::{ BooleanBuffer, Buffer, MutableBuffer, NullBuffer, OffsetBuffer, OffsetBufferBuilder, - ScalarBuffer, + ScalarBuffer, ToByteSlice, }; -use arrow_data::ArrayData; 
use arrow_data::transform::MutableArrayData; +use arrow_data::{ArrayData, ByteView}; use arrow_schema::{ArrowError, DataType}; use std::fmt::{Debug, Formatter}; use std::hash::Hash; use std::marker::PhantomData; use std::ops::Not; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; /// Zip two arrays by some boolean mask. /// @@ -284,7 +287,12 @@ impl ScalarZipper { DataType::LargeBinary => { Arc::new(BytesScalarImpl::::new(truthy, falsy)) as Arc }, - // TODO: Handle Utf8View https://github.com/apache/arrow-rs/issues/8724 + DataType::Utf8View => { + Arc::new(ByteViewScalarImpl::::new(truthy, falsy)) as Arc + }, + DataType::BinaryView => { + Arc::new(ByteViewScalarImpl::::new(truthy, falsy)) as Arc + }, _ => { Arc::new(FallbackImpl::new(truthy, falsy)) as Arc }, @@ -657,6 +665,182 @@ fn maybe_prep_null_mask_filter(predicate: &BooleanArray) -> BooleanBuffer { } } +struct ByteViewScalarImpl { + truthy_view: Option, + truthy_buffers: Arc<[Buffer]>, + falsy_view: Option, + falsy_buffers: Arc<[Buffer]>, + phantom: PhantomData, +} + +static EMPTY_ARC: OnceLock> = OnceLock::new(); +fn empty_arc_buffers() -> Arc<[Buffer]> { + Arc::clone(EMPTY_ARC.get_or_init(|| Arc::new([]))) +} + +impl ByteViewScalarImpl { + fn new(truthy: &dyn Array, falsy: &dyn Array) -> Self { + let (truthy_view, truthy_buffers) = Self::get_value_from_scalar(truthy); + let (falsy_view, falsy_buffers) = Self::get_value_from_scalar(falsy); + Self { + truthy_view, + truthy_buffers, + falsy_view, + falsy_buffers, + phantom: PhantomData, + } + } + + fn get_value_from_scalar(scalar: &dyn Array) -> (Option, Arc<[Buffer]>) { + if scalar.is_null(0) { + (None, empty_arc_buffers()) + } else { + let (views, buffers, _) = scalar.as_byte_view::().clone().into_parts(); + (views.first().copied(), buffers) + } + } + + fn get_views_for_single_non_nullable( + predicate: BooleanBuffer, + value: u128, + buffers: Arc<[Buffer]>, + ) -> (ScalarBuffer, Arc<[Buffer]>, Option) { + let number_of_true = 
predicate.count_set_bits(); + let number_of_values = predicate.len(); + + // Fast path for all nulls + if number_of_true == 0 { + // All values are null + return ( + vec![0; number_of_values].into(), + empty_arc_buffers(), + Some(NullBuffer::new_null(number_of_values)), + ); + } + let bytes = vec![value; number_of_values]; + + // If value is true and we want to handle the TRUTHY case, the null buffer will have 1 (meaning not null) + // If value is false and we want to handle the FALSY case, the null buffer will have 0 (meaning null) + let nulls = NullBuffer::new(predicate); + (bytes.into(), buffers, Some(nulls)) + } + + fn get_views_for_non_nullable( + predicate: BooleanBuffer, + result_len: usize, + truthy_view: u128, + truthy_buffers: Arc<[Buffer]>, + falsy_view: u128, + falsy_buffers: Arc<[Buffer]>, + ) -> (ScalarBuffer, Arc<[Buffer]>, Option) { + let true_count = predicate.count_set_bits(); + match true_count { + 0 => { + // all values are falsy + (vec![falsy_view; result_len].into(), falsy_buffers, None) + } + n if n == predicate.len() => { + // all values are truthy + (vec![truthy_view; result_len].into(), truthy_buffers, None) + } + _ => { + let true_count = predicate.count_set_bits(); + let mut buffers: Vec = truthy_buffers.to_vec(); + + // If the falsy buffers are empty, we can use the falsy view as it is, because the value + // is completely inlined. 
Otherwise, we have non-inlined values in the buffer, and we need + // to recalculate the falsy view + let view_falsy = if falsy_buffers.is_empty() { + falsy_view + } else { + let byte_view_falsy = ByteView::from(falsy_view); + let new_index_falsy_buffers = + buffers.len() as u32 + byte_view_falsy.buffer_index; + buffers.extend(falsy_buffers.iter().cloned()); + let byte_view_falsy = + byte_view_falsy.with_buffer_index(new_index_falsy_buffers); + byte_view_falsy.as_u128() + }; + + let total_number_of_bytes = true_count * 16 + (predicate.len() - true_count) * 16; + let mut mutable = MutableBuffer::new(total_number_of_bytes); + let mut filled = 0; + + SlicesIterator::from(&predicate).for_each(|(start, end)| { + if start > filled { + let false_repeat_count = start - filled; + mutable + .repeat_slice_n_times(view_falsy.to_byte_slice(), false_repeat_count); + } + let true_repeat_count = end - start; + mutable.repeat_slice_n_times(truthy_view.to_byte_slice(), true_repeat_count); + filled = end; + }); + + if filled < predicate.len() { + let false_repeat_count = predicate.len() - filled; + mutable.repeat_slice_n_times(view_falsy.to_byte_slice(), false_repeat_count); + } + + let bytes = Buffer::from(mutable); + (bytes.into(), buffers.into(), None) + } + } + } +} + +impl Debug for ByteViewScalarImpl { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ByteViewScalarImpl") + .field("truthy", &self.truthy_view) + .field("falsy", &self.falsy_view) + .finish() + } +} + +impl ZipImpl for ByteViewScalarImpl { + fn create_output(&self, predicate: &BooleanArray) -> Result { + let result_len = predicate.len(); + // Nulls are treated as false + let predicate = maybe_prep_null_mask_filter(predicate); + + let (views, buffers, nulls) = match (self.truthy_view, self.falsy_view) { + (Some(truthy), Some(falsy)) => Self::get_views_for_non_nullable( + predicate, + result_len, + truthy, + Arc::clone(&self.truthy_buffers), + falsy, + Arc::clone(&self.falsy_buffers), + 
), + (Some(truthy), None) => Self::get_views_for_single_non_nullable( + predicate, + truthy, + Arc::clone(&self.truthy_buffers), + ), + (None, Some(falsy)) => { + let predicate = predicate.not(); + Self::get_views_for_single_non_nullable( + predicate, + falsy, + Arc::clone(&self.falsy_buffers), + ) + } + (None, None) => { + // All values are null + ( + vec![0; result_len].into(), + empty_arc_buffers(), + Some(NullBuffer::new_null(result_len)), + ) + } + }; + + let result = unsafe { GenericByteViewArray::::new_unchecked(views, buffers, nulls) }; + Ok(Arc::new(result)) + } +} + #[cfg(test)] mod test { use super::*; @@ -1222,4 +1406,158 @@ mod test { ]); assert_eq!(actual, &expected); } + + #[test] + fn test_zip_kernel_scalar_strings_array_view() { + let scalar_truthy = Scalar::new(StringViewArray::from(vec!["hello"])); + let scalar_falsy = Scalar::new(StringViewArray::from(vec!["world"])); + + let mask = BooleanArray::from(vec![true, false, true, false]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_string_view(); + let expected = StringViewArray::from(vec![ + Some("hello"), + Some("world"), + Some("hello"), + Some("world"), + ]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_binary_array_view() { + let scalar_truthy = Scalar::new(BinaryViewArray::from_iter_values(vec![b"hello"])); + let scalar_falsy = Scalar::new(BinaryViewArray::from_iter_values(vec![b"world"])); + + let mask = BooleanArray::from(vec![true, false]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_byte_view(); + let expected = BinaryViewArray::from_iter_values(vec![b"hello", b"world"]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_strings_array_view_with_nulls() { + let scalar_truthy = Scalar::new(StringViewArray::from_iter_values(["hello"])); + let scalar_falsy = Scalar::new(StringViewArray::new_null(1)); + + let mask = BooleanArray::from(vec![true, true, 
false, false, true]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_any().downcast_ref::().unwrap(); + let expected = StringViewArray::from_iter(vec![ + Some("hello"), + Some("hello"), + None, + None, + Some("hello"), + ]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_strings_array_view_all_true_null() { + let scalar_truthy = Scalar::new(StringViewArray::new_null(1)); + let scalar_falsy = Scalar::new(StringViewArray::new_null(1)); + let mask = BooleanArray::from(vec![true, true]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_any().downcast_ref::().unwrap(); + let expected = StringViewArray::from_iter(vec![None::, None]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_strings_array_view_all_false_null() { + let scalar_truthy = Scalar::new(StringViewArray::new_null(1)); + let scalar_falsy = Scalar::new(StringViewArray::new_null(1)); + let mask = BooleanArray::from(vec![false, false]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_any().downcast_ref::().unwrap(); + let expected = StringViewArray::from_iter(vec![None::, None]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_string_array_view_all_true() { + let scalar_truthy = Scalar::new(StringViewArray::from(vec!["hello"])); + let scalar_falsy = Scalar::new(StringViewArray::from(vec!["world"])); + + let mask = BooleanArray::from(vec![true, true]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_string_view(); + let expected = StringViewArray::from(vec![Some("hello"), Some("hello")]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_string_array_view_all_false() { + let scalar_truthy = Scalar::new(StringViewArray::from(vec!["hello"])); + let scalar_falsy = Scalar::new(StringViewArray::from(vec!["world"])); + + let mask = 
BooleanArray::from(vec![false, false]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_string_view(); + let expected = StringViewArray::from(vec![Some("world"), Some("world")]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_strings_large_strings() { + let scalar_truthy = Scalar::new(StringViewArray::from(vec!["longer than 12 bytes"])); + let scalar_falsy = Scalar::new(StringViewArray::from(vec!["another longer than 12 bytes"])); + + let mask = BooleanArray::from(vec![true, false]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_string_view(); + let expected = StringViewArray::from(vec![ + Some("longer than 12 bytes"), + Some("another longer than 12 bytes"), + ]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_strings_array_view_large_short_strings() { + let scalar_truthy = Scalar::new(StringViewArray::from(vec!["hello"])); + let scalar_falsy = Scalar::new(StringViewArray::from(vec!["longer than 12 bytes"])); + + let mask = BooleanArray::from(vec![true, false, true, false]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_string_view(); + let expected = StringViewArray::from(vec![ + Some("hello"), + Some("longer than 12 bytes"), + Some("hello"), + Some("longer than 12 bytes"), + ]); + assert_eq!(actual, &expected); + } + #[test] + fn test_zip_kernel_scalar_strings_array_view_large_all_true() { + let scalar_truthy = Scalar::new(StringViewArray::from(vec!["longer than 12 bytes"])); + let scalar_falsy = Scalar::new(StringViewArray::from(vec!["another longer than 12 bytes"])); + + let mask = BooleanArray::from(vec![true, true]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_string_view(); + let expected = StringViewArray::from(vec![ + Some("longer than 12 bytes"), + Some("longer than 12 bytes"), + ]); + assert_eq!(actual, &expected); + } + + #[test] + fn 
test_zip_kernel_scalar_strings_array_view_large_all_false() { + let scalar_truthy = Scalar::new(StringViewArray::from(vec!["longer than 12 bytes"])); + let scalar_falsy = Scalar::new(StringViewArray::from(vec!["another longer than 12 bytes"])); + + let mask = BooleanArray::from(vec![false, false]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_string_view(); + let expected = StringViewArray::from(vec![ + Some("another longer than 12 bytes"), + Some("another longer than 12 bytes"), + ]); + assert_eq!(actual, &expected); + } } diff --git a/arrow/benches/zip_kernels.rs b/arrow/benches/zip_kernels.rs index 31cbca639717..65f6bb280f00 100644 --- a/arrow/benches/zip_kernels.rs +++ b/arrow/benches/zip_kernels.rs @@ -21,6 +21,7 @@ use rand::distr::{Distribution, StandardUniform}; use rand::prelude::StdRng; use rand::{Rng, SeedableRng}; use std::hint; +use std::ops::Range; use std::sync::Arc; use arrow::array::*; @@ -133,6 +134,35 @@ where } } +struct GenerateStringView { + range: Range, + description: String, + _marker: std::marker::PhantomData, +} + +impl InputGenerator for GenerateStringView { + fn name(&self) -> &str { + self.description.as_str() + } + fn generate_scalar_with_null_value(&self) -> ArrayRef { + new_null_array(&DataType::Utf8View, 1) + } + + fn generate_non_null_scalars(&self, seed: u64, number_of_scalars: usize) -> Vec { + let array = self.generate_array(seed, number_of_scalars, 0.0); + (0..number_of_scalars).map(|i| array.slice(i, 1)).collect() + } + + fn generate_array(&self, seed: u64, array_length: usize, null_percentage: f32) -> ArrayRef { + Arc::new(create_string_view_array_with_len_range_and_seed( + array_length, + null_percentage, + self.range.clone(), + seed, + )) + } +} + fn mask_cases(len: usize) -> Vec<(&'static str, BooleanArray)> { vec![ ("all_true", create_boolean_array(len, 0.0, 1.0)), @@ -273,6 +303,24 @@ fn add_benchmark(c: &mut Criterion) { _marker: std::marker::PhantomData, }, ); + + 
bench_zip_on_input_generator( + c, + &GenerateStringView { + description: "string_views size (3..10)".to_string(), + range: 3..10, + _marker: std::marker::PhantomData, + }, + ); + + bench_zip_on_input_generator( + c, + &GenerateStringView { + description: "string_views size (10..100)".to_string(), + range: 10..100, + _marker: std::marker::PhantomData, + }, + ); } criterion_group!(benches, add_benchmark); diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index 9f83a50f4f8f..1f1dcff9b62a 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -208,6 +208,33 @@ pub fn create_string_array_with_len_range_and_prefix_and_seed, + seed: u64, +) -> StringViewArray { + let rng = &mut StdRng::seed_from_u64(seed); + (0..size) + .map(|_| { + if rng.random::() < null_density { + None + } else { + let str_len = rng.random_range(range.clone()); + let value = rng.sample_iter(&Alphanumeric).take(str_len).collect(); + let value = String::from_utf8(value).unwrap(); + Some(value) + } + }) + .collect() +} fn create_string_view_array_with_len_range_and_prefix( size: usize, diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 89bbe4b1fbcb..023436e0a7f7 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -66,110 +66,72 @@ pub fn create_random_batch( pub fn create_random_array( field: &Field, size: usize, - null_density: f32, + mut null_density: f32, true_density: f32, ) -> Result { - // Override null density with 0.0 if the array is non-nullable - // and a primitive type in case a nested field is nullable - let primitive_null_density = match field.is_nullable() { - true => null_density, - false => 0.0, - }; + // Override nullability in case of not nested and not dictionary + // For nested we don't want to override as we want to keep the nullability for the children + // For dictionary it handle the nullability internally + if !field.data_type().is_nested() && !matches!(field.data_type(), 
Dictionary(_, _)) { + // Override null density with 0.0 if the array is non-nullable + null_density = match field.is_nullable() { + true => null_density, + false => 0.0, + }; + } + use DataType::*; - Ok(match field.data_type() { + let array = match field.data_type() { Null => Arc::new(NullArray::new(size)) as ArrayRef, - Boolean => Arc::new(create_boolean_array( - size, - primitive_null_density, - true_density, - )), - Int8 => Arc::new(create_primitive_array::( - size, - primitive_null_density, - )), - Int16 => Arc::new(create_primitive_array::( - size, - primitive_null_density, - )), - Int32 => Arc::new(create_primitive_array::( - size, - primitive_null_density, - )), - Int64 => Arc::new(create_primitive_array::( - size, - primitive_null_density, - )), - UInt8 => Arc::new(create_primitive_array::( - size, - primitive_null_density, - )), - UInt16 => Arc::new(create_primitive_array::( - size, - primitive_null_density, - )), - UInt32 => Arc::new(create_primitive_array::( - size, - primitive_null_density, - )), - UInt64 => Arc::new(create_primitive_array::( - size, - primitive_null_density, - )), + Boolean => Arc::new(create_boolean_array(size, null_density, true_density)), + Int8 => Arc::new(create_primitive_array::(size, null_density)), + Int16 => Arc::new(create_primitive_array::(size, null_density)), + Int32 => Arc::new(create_primitive_array::(size, null_density)), + Int64 => Arc::new(create_primitive_array::(size, null_density)), + UInt8 => Arc::new(create_primitive_array::(size, null_density)), + UInt16 => Arc::new(create_primitive_array::(size, null_density)), + UInt32 => Arc::new(create_primitive_array::(size, null_density)), + UInt64 => Arc::new(create_primitive_array::(size, null_density)), Float16 => { return Err(ArrowError::NotYetImplemented( "Float16 is not implemented".to_string(), )); } - Float32 => Arc::new(create_primitive_array::( - size, - primitive_null_density, - )), - Float64 => Arc::new(create_primitive_array::( - size, - 
primitive_null_density, - )), + Float32 => Arc::new(create_primitive_array::(size, null_density)), + Float64 => Arc::new(create_primitive_array::(size, null_density)), Timestamp(unit, tz) => match unit { TimeUnit::Second => Arc::new( - create_random_temporal_array::(size, primitive_null_density) + create_random_temporal_array::(size, null_density) .with_timezone_opt(tz.clone()), - ), + ) as ArrayRef, TimeUnit::Millisecond => Arc::new( - create_random_temporal_array::( - size, - primitive_null_density, - ) - .with_timezone_opt(tz.clone()), + create_random_temporal_array::(size, null_density) + .with_timezone_opt(tz.clone()), ), TimeUnit::Microsecond => Arc::new( - create_random_temporal_array::( - size, - primitive_null_density, - ) - .with_timezone_opt(tz.clone()), + create_random_temporal_array::(size, null_density) + .with_timezone_opt(tz.clone()), ), TimeUnit::Nanosecond => Arc::new( - create_random_temporal_array::( - size, - primitive_null_density, - ) - .with_timezone_opt(tz.clone()), + create_random_temporal_array::(size, null_density) + .with_timezone_opt(tz.clone()), ), }, Date32 => Arc::new(create_random_temporal_array::( size, - primitive_null_density, + null_density, )), Date64 => Arc::new(create_random_temporal_array::( size, - primitive_null_density, + null_density, )), Time32(unit) => match unit { TimeUnit::Second => Arc::new(create_random_temporal_array::( size, - primitive_null_density, + null_density, )) as ArrayRef, TimeUnit::Millisecond => Arc::new( - create_random_temporal_array::(size, primitive_null_density), + create_random_temporal_array::(size, null_density), ), _ => { return Err(ArrowError::InvalidArgumentError(format!( @@ -179,11 +141,11 @@ pub fn create_random_array( }, Time64(unit) => match unit { TimeUnit::Microsecond => Arc::new( - create_random_temporal_array::(size, primitive_null_density), + create_random_temporal_array::(size, null_density), ) as ArrayRef, TimeUnit::Nanosecond => Arc::new(create_random_temporal_array::( size, - 
primitive_null_density, + null_density, )), _ => { return Err(ArrowError::InvalidArgumentError(format!( @@ -191,24 +153,19 @@ pub fn create_random_array( ))); } }, - Utf8 => Arc::new(create_string_array::(size, primitive_null_density)), - LargeUtf8 => Arc::new(create_string_array::(size, primitive_null_density)), + Utf8 => Arc::new(create_string_array::(size, null_density)), + LargeUtf8 => Arc::new(create_string_array::(size, null_density)), Utf8View => Arc::new(create_string_view_array_with_len( size, - primitive_null_density, + null_density, 4, false, )), - Binary => Arc::new(create_binary_array::(size, primitive_null_density)), - LargeBinary => Arc::new(create_binary_array::(size, primitive_null_density)), - FixedSizeBinary(len) => Arc::new(create_fsb_array( - size, - primitive_null_density, - *len as usize, - )), + Binary => Arc::new(create_binary_array::(size, null_density)), + LargeBinary => Arc::new(create_binary_array::(size, null_density)), + FixedSizeBinary(len) => Arc::new(create_fsb_array(size, null_density, *len as usize)), BinaryView => Arc::new( - create_string_view_array_with_len(size, primitive_null_density, 4, false) - .to_binary_view(), + create_string_view_array_with_len(size, null_density, 4, false).to_binary_view(), ), List(_) => create_random_list_array(field, size, null_density, true_density)?, LargeList(_) => create_random_list_array(field, size, null_density, true_density)?, @@ -230,7 +187,13 @@ pub fn create_random_array( "Generating random arrays not yet implemented for {other:?}" ))); } - }) + }; + + if !field.is_nullable() { + assert_eq!(array.null_count(), 0); + } + + Ok(array) } #[inline] @@ -812,4 +775,23 @@ mod tests { assert_eq!(array.len(), size); } } + + #[test] + fn create_non_nullable_decimal_array_with_null_density() { + let size = 10; + let fields = vec![ + Field::new("a", DataType::Decimal128(10, -2), false), + Field::new("b", DataType::Decimal256(10, -2), false), + ]; + let schema = Schema::new(fields); + let schema_ref = 
Arc::new(schema); + let batch = create_random_batch(schema_ref.clone(), size, 0.35, 0.7).unwrap(); + + assert_eq!(batch.schema(), schema_ref); + assert_eq!(batch.num_columns(), schema_ref.fields().len()); + for array in batch.columns() { + assert_eq!(array.len(), size); + assert_eq!(array.null_count(), 0); + } + } } diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 2812988382d3..7f0195bbd7bb 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="57.0.0" -FUTURE_RELEASE="57.1.0" +SINCE_TAG="57.1.0" +FUTURE_RELEASE="57.2.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" diff --git a/parquet-variant-compute/Cargo.toml b/parquet-variant-compute/Cargo.toml index 74c3dd3fb72f..85d66a9cf706 100644 --- a/parquet-variant-compute/Cargo.toml +++ b/parquet-variant-compute/Cargo.toml @@ -37,6 +37,7 @@ parquet-variant = { workspace = true } parquet-variant-json = { workspace = true } chrono = { workspace = true } uuid = { version = "1.18.0", features = ["v4"]} +serde_json = "1.0" [lib] name = "parquet_variant_compute" diff --git a/parquet-variant-compute/benches/variant_kernels.rs b/parquet-variant-compute/benches/variant_kernels.rs index 13ff77d9fb18..383697ab8cc6 100644 --- a/parquet-variant-compute/benches/variant_kernels.rs +++ b/parquet-variant-compute/benches/variant_kernels.rs @@ -23,12 +23,15 @@ use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, Variant, VariantBuilder}; use parquet_variant_compute::{ GetOptions, VariantArray, VariantArrayBuilder, json_to_variant, variant_get, }; +use parquet_variant_json::append_json; use rand::Rng; use rand::SeedableRng; use rand::distr::Alphanumeric; use rand::rngs::StdRng; +use serde_json::Value; use std::fmt::Write; use std::sync::Arc; + fn benchmark_batch_json_string_to_variant(c: &mut Criterion) { let input_array = 
StringArray::from_iter_values(json_repeated_struct(8000)); let array_ref: ArrayRef = Arc::new(input_array); @@ -66,6 +69,58 @@ fn benchmark_batch_json_string_to_variant(c: &mut Criterion) { }); }); + let input_array = StringArray::from_iter_values(random_structure(8000, 200)); + let total_input_bytes = input_array + .iter() + .flatten() // filter None + .map(|v| v.len()) + .sum::(); + let id = format!( + "batch_json_string_to_variant object - 1 depth(200 fields) random_json({} bytes per document)", + total_input_bytes / input_array.len() + ); + let array_ref: ArrayRef = Arc::new(input_array); + let string_array = array_ref.as_any().downcast_ref::().unwrap(); + let mut json_array: Vec = Vec::with_capacity(string_array.len()); + for i in 0..string_array.len() { + json_array.push(serde_json::from_str(string_array.value(i)).unwrap()); + } + c.bench_function(&id, |b| { + b.iter(|| { + let mut variant_array_builder = VariantArrayBuilder::new(string_array.len()); + for json in &json_array { + append_json(json, &mut variant_array_builder).unwrap(); + } + let _ = variant_array_builder.build(); + }); + }); + + let input_array = StringArray::from_iter_values(random_structure(8000, 100)); + let total_input_bytes = input_array + .iter() + .flatten() // filter None + .map(|v| v.len()) + .sum::(); + let id = format!( + "batch_json_string_to_variant object - 1 depth(100 fields) random_json({} bytes per document)", + total_input_bytes / input_array.len() + ); + let array_ref: ArrayRef = Arc::new(input_array); + let string_array = array_ref.as_any().downcast_ref::().unwrap(); + let mut json_array: Vec = Vec::with_capacity(string_array.len()); + for i in 0..string_array.len() { + json_array.push(serde_json::from_str(string_array.value(i)).unwrap()); + } + c.bench_function(&id, |b| { + b.iter(|| { + let mut variant_array_builder = VariantArrayBuilder::new(string_array.len()); + for json in &json_array { + append_json(json, &mut variant_array_builder).unwrap(); + } + let _ = 
variant_array_builder.build(); + }); + }); + let input_array = StringArray::from_iter_values(random_json_structure(8000)); let total_input_bytes = input_array .iter() @@ -240,6 +295,22 @@ fn random_json_structure(count: usize) -> impl Iterator { (0..count).map(move |_| generator.next().to_string()) } +fn random_structure(count: usize, max_fields: usize) -> impl Iterator { + let mut generator = RandomJsonGenerator { + null_weight: 5, + string_weight: 25, + number_weight: 25, + boolean_weight: 10, + object_weight: 25, + array_weight: 0, + max_fields, + max_array_length: 0, + max_depth: 1, + ..Default::default() + }; + (0..count).map(move |_| generator.next_object().to_string()) +} + /// Creates JSON with random structure and fields. /// /// Each type is created in proportion controlled by the @@ -299,6 +370,82 @@ impl RandomJsonGenerator { &self.output_buffer } + fn next_object(&mut self) -> &str { + self.output_buffer.clear(); + self.append_random_json_for_object(); + &self.output_buffer + } + + fn append_random_json_for_object(&mut self) { + // use destructuring to ensure each field is used + let Self { + rng, + null_weight, + string_weight, + number_weight, + boolean_weight, + max_fields, + output_buffer, + .. 
+ } = self; + + write!(output_buffer, "{{").unwrap(); + for i in 0..*max_fields { + let key_length = rng.random_range(1..=20); + let key: String = (0..key_length) + .map(|_| rng.sample(Alphanumeric) as char) + .collect(); + write!(output_buffer, "\"{key}\":").unwrap(); + + let total_weight = *null_weight + *string_weight + *number_weight + *boolean_weight; + + // Generate a random number to determine the type + let mut random_value: usize = rng.random_range(0..total_weight); + + if random_value <= *null_weight { + write!(output_buffer, "null").unwrap(); + } else { + random_value -= *null_weight; + + if random_value <= *string_weight { + // Generate a random string between 1 and 20 characters + let length = rng.random_range(1..=20); + let random_string: String = (0..length) + .map(|_| rng.sample(Alphanumeric) as char) + .collect(); + write!(output_buffer, "\"{random_string}\"",).unwrap(); + } else { + random_value -= *string_weight; + + if random_value <= *number_weight { + // 50% chance of generating an integer or a float + if rng.random_bool(0.5) { + // Generate a random integer + let random_integer: i64 = rng.random_range(-1000..1000); + write!(output_buffer, "{random_integer}",).unwrap(); + } else { + // Generate a random float + let random_float: f64 = rng.random_range(-1000.0..1000.0); + write!(output_buffer, "{random_float}",).unwrap(); + } + } else { + random_value -= *number_weight; + + if random_value <= *boolean_weight { + // Generate a random boolean + let random_boolean: bool = rng.random(); + write!(output_buffer, "{random_boolean}",).unwrap(); + } + } + } + } + if i < *max_fields - 1 { + write!(output_buffer, ",").unwrap(); + } + } + write!(&mut self.output_buffer, "}}").unwrap(); + } + /// Appends a random JSON value to the output buffer. 
fn append_random_json(&mut self, current_depth: usize) { // use destructuring to ensure each field is used diff --git a/parquet-variant-compute/src/arrow_to_variant.rs b/parquet-variant-compute/src/arrow_to_variant.rs index 3009b602cb80..be241a9a4e00 100644 --- a/parquet-variant-compute/src/arrow_to_variant.rs +++ b/parquet-variant-compute/src/arrow_to_variant.rs @@ -15,12 +15,11 @@ // specific language governing permissions and limitations // under the License. -use crate::type_conversion::CastOptions; use arrow::array::{ Array, ArrayRef, AsArray, FixedSizeListArray, GenericBinaryArray, GenericListArray, GenericListViewArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray, }; -use arrow::compute::kernels::cast; +use arrow::compute::{CastOptions, kernels::cast}; use arrow::datatypes::{ self as datatypes, ArrowNativeType, ArrowPrimitiveType, ArrowTemporalType, ArrowTimestampType, DecimalType, RunEndIndexType, @@ -367,7 +366,7 @@ macro_rules! define_row_builder { $( // NOTE: The `?` macro expansion fails without the type annotation. 
let Some(value): Option<$option_ty> = value else { - if self.options.strict { + if !self.options.safe { return Err(ArrowError::ComputeError(format!( "Failed to convert value at index {index}: conversion failed", ))); @@ -404,7 +403,7 @@ define_row_builder!( where V: VariantDecimalType, { - options: &'a CastOptions, + options: &'a CastOptions<'a>, scale: i8, }, |array| -> PrimitiveArray { array.as_primitive() }, @@ -414,7 +413,7 @@ define_row_builder!( // Decimal256 needs a two-stage conversion via i128 define_row_builder!( struct Decimal256ArrowToVariantBuilder<'a> { - options: &'a CastOptions, + options: &'a CastOptions<'a>, scale: i8, }, |array| -> arrow::array::Decimal256Array { array.as_primitive() }, @@ -426,7 +425,7 @@ define_row_builder!( define_row_builder!( struct TimestampArrowToVariantBuilder<'a, T: ArrowTimestampType> { - options: &'a CastOptions, + options: &'a CastOptions<'a>, has_time_zone: bool, }, |array| -> PrimitiveArray { array.as_primitive() }, @@ -450,7 +449,7 @@ define_row_builder!( where i64: From, { - options: &'a CastOptions, + options: &'a CastOptions<'a>, }, |array| -> PrimitiveArray { array.as_primitive() }, |value| -> Option<_> { @@ -464,7 +463,7 @@ define_row_builder!( where i64: From, { - options: &'a CastOptions, + options: &'a CastOptions<'a>, }, |array| -> PrimitiveArray { array.as_primitive() }, |value| -> Option<_> { @@ -899,7 +898,13 @@ mod tests { /// Builds a VariantArray from an Arrow array using the row builder. fn execute_row_builder_test(array: &dyn Array) -> VariantArray { - execute_row_builder_test_with_options(array, CastOptions::default()) + execute_row_builder_test_with_options( + array, + CastOptions { + safe: false, + ..Default::default() + }, + ) } /// Variant of `execute_row_builder_test` that allows specifying options @@ -925,7 +930,14 @@ mod tests { /// Generic helper function to test row builders with basic assertion patterns. /// Uses execute_row_builder_test and adds simple value comparison assertions. 
fn test_row_builder_basic(array: &dyn Array, expected_values: Vec>) { - test_row_builder_basic_with_options(array, expected_values, CastOptions::default()); + test_row_builder_basic_with_options( + array, + expected_values, + CastOptions { + safe: false, + ..Default::default() + }, + ); } /// Variant of `test_row_builder_basic` that allows specifying options @@ -1058,7 +1070,10 @@ mod tests { let run_ends = Int32Array::from(vec![2, 5, 6]); let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - let options = CastOptions::default(); + let options = CastOptions { + safe: false, + ..Default::default() + }; let mut row_builder = make_arrow_to_variant_row_builder(run_array.data_type(), &run_array, &options).unwrap(); @@ -1084,7 +1099,10 @@ mod tests { let run_ends = Int32Array::from(vec![2, 4, 5]); let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - let options = CastOptions::default(); + let options = CastOptions { + safe: false, + ..Default::default() + }; let mut row_builder = make_arrow_to_variant_row_builder(run_array.data_type(), &run_array, &options).unwrap(); let mut array_builder = VariantArrayBuilder::new(5); @@ -1135,7 +1153,10 @@ mod tests { let keys = Int32Array::from(vec![Some(0), None, Some(1), None, Some(2)]); let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); - let options = CastOptions::default(); + let options = CastOptions { + safe: false, + ..Default::default() + }; let mut row_builder = make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array, &options) .unwrap(); @@ -1167,7 +1188,10 @@ mod tests { let keys = Int32Array::from(vec![0, 1, 2, 0, 1, 2]); let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); - let options = CastOptions::default(); + let options = CastOptions { + safe: false, + ..Default::default() + }; let mut row_builder = make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array, &options) .unwrap(); @@ -1207,7 +1231,10 @@ mod tests 
{ let dict_array = DictionaryArray::::try_new(keys, Arc::new(struct_array)).unwrap(); - let options = CastOptions::default(); + let options = CastOptions { + safe: false, + ..Default::default() + }; let mut row_builder = make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array, &options) .unwrap(); @@ -1302,7 +1329,10 @@ mod tests { // Slice to get just the middle element: [[3, 4, 5]] let sliced_array = list_array.slice(1, 1); - let options = CastOptions::default(); + let options = CastOptions { + safe: false, + ..Default::default() + }; let mut row_builder = make_arrow_to_variant_row_builder(sliced_array.data_type(), &sliced_array, &options) .unwrap(); @@ -1346,7 +1376,10 @@ mod tests { Some(arrow::buffer::NullBuffer::from(vec![true, false])), ); - let options = CastOptions::default(); + let options = CastOptions { + safe: false, + ..Default::default() + }; let mut row_builder = make_arrow_to_variant_row_builder(outer_list.data_type(), &outer_list, &options) .unwrap(); @@ -1533,7 +1566,10 @@ mod tests { .unwrap(); // Test the row builder - let options = CastOptions::default(); + let options = CastOptions { + safe: false, + ..Default::default() + }; let mut row_builder = make_arrow_to_variant_row_builder(union_array.data_type(), &union_array, &options) .unwrap(); @@ -1585,7 +1621,10 @@ mod tests { .unwrap(); // Test the row builder - let options = CastOptions::default(); + let options = CastOptions { + safe: false, + ..Default::default() + }; let mut row_builder = make_arrow_to_variant_row_builder(union_array.data_type(), &union_array, &options) .unwrap(); @@ -1663,7 +1702,7 @@ mod tests { Some(Variant::Null), // Overflow value becomes Variant::Null Some(Variant::from(VariantDecimal16::try_new(123, 3).unwrap())), ], - CastOptions { strict: false }, + CastOptions::default(), ); } diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index c3ffc7a42cc7..b6c968b0678d 100644 --- 
a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -16,8 +16,9 @@ // under the License. use crate::arrow_to_variant::make_arrow_to_variant_row_builder; -use crate::{CastOptions, VariantArray, VariantArrayBuilder}; +use crate::{VariantArray, VariantArrayBuilder}; use arrow::array::Array; +use arrow::compute::CastOptions; use arrow_schema::ArrowError; /// Casts a typed arrow [`Array`] to a [`VariantArray`]. This is useful when you @@ -75,9 +76,15 @@ pub fn cast_to_variant_with_options( /// failures). /// /// This function provides backward compatibility. For non-strict behavior, -/// use [`cast_to_variant_with_options`] with `CastOptions { strict: false }`. +/// use [`cast_to_variant_with_options`] with `CastOptions { safe: true, ..Default::default() }`. pub fn cast_to_variant(input: &dyn Array) -> Result { - cast_to_variant_with_options(input, &CastOptions::default()) + cast_to_variant_with_options( + input, + &CastOptions { + safe: false, + ..Default::default() + }, + ) } #[cfg(test)] @@ -2255,14 +2262,17 @@ mod tests { } fn run_test(values: ArrayRef, expected: Vec>) { - run_test_with_options(values, expected, CastOptions { strict: false }); + run_test_with_options(values, expected, CastOptions::default()); } fn run_test_in_strict_mode( values: ArrayRef, expected: Result>, ArrowError>, ) { - let options = CastOptions { strict: true }; + let options = CastOptions { + safe: false, + ..Default::default() + }; match expected { Ok(expected) => run_test_with_options(values, expected, options), Err(_) => { diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index 9b8008f58422..b05d0e023653 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -58,6 +58,5 @@ pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options}; pub use from_json::json_to_variant; pub use shred_variant::{IntoShreddingField, ShreddedSchemaBuilder, shred_variant}; 
pub use to_json::variant_to_json; -pub use type_conversion::CastOptions; pub use unshred_variant::unshred_variant; pub use variant_get::{GetOptions, variant_get}; diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index 45e7fc95c9f9..7f253d249dfb 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -19,19 +19,17 @@ use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; use crate::variant_to_arrow::{ - PrimitiveVariantToArrowRowBuilder, make_primitive_variant_to_arrow_row_builder, + ArrayVariantToArrowRowBuilder, PrimitiveVariantToArrowRowBuilder, + make_primitive_variant_to_arrow_row_builder, }; use crate::{VariantArray, VariantValueArrayBuilder}; -use arrow::array::{ - ArrayRef, BinaryViewArray, GenericListArray, GenericListViewArray, NullBufferBuilder, - OffsetSizeTrait, -}; -use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; +use arrow::array::{ArrayRef, BinaryViewArray, NullBufferBuilder}; +use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; -use arrow::datatypes::{ArrowNativeTypeOp, DataType, Field, FieldRef, Fields, TimeUnit}; +use arrow::datatypes::{DataType, Field, FieldRef, Fields, TimeUnit}; use arrow::error::{ArrowError, Result}; use indexmap::IndexMap; -use parquet_variant::{Variant, VariantBuilderExt, VariantList, VariantPath, VariantPathElement}; +use parquet_variant::{Variant, VariantBuilderExt, VariantPath, VariantPathElement}; use std::collections::BTreeMap; use std::sync::Arc; @@ -123,7 +121,8 @@ pub(crate) fn make_variant_to_shredded_variant_arrow_row_builder<'a>( DataType::List(_) | DataType::LargeList(_) | DataType::ListView(_) - | DataType::LargeListView(_) => { + | DataType::LargeListView(_) + | DataType::FixedSizeList(..) 
=> { let typed_value_builder = VariantToShreddedArrayVariantRowBuilder::try_new( data_type, cast_options, @@ -131,11 +130,6 @@ pub(crate) fn make_variant_to_shredded_variant_arrow_row_builder<'a>( )?; VariantToShreddedVariantRowBuilder::Array(typed_value_builder) } - DataType::FixedSizeList(..) => { - return Err(ArrowError::NotYetImplemented( - "Shredding variant array values as fixed-size lists".to_string(), - )); - } // Supported shredded primitive types, see Variant shredding spec: // https://github.com/apache/parquet-format/blob/master/VariantShredding.md#shredded-value-types DataType::Boolean @@ -312,171 +306,6 @@ impl<'a> VariantToShreddedArrayVariantRowBuilder<'a> { } } -enum ArrayVariantToArrowRowBuilder<'a> { - List(VariantToListArrowRowBuilder<'a, i32, false>), - LargeList(VariantToListArrowRowBuilder<'a, i64, false>), - ListView(VariantToListArrowRowBuilder<'a, i32, true>), - LargeListView(VariantToListArrowRowBuilder<'a, i64, true>), -} - -impl<'a> ArrayVariantToArrowRowBuilder<'a> { - fn try_new( - data_type: &'a DataType, - cast_options: &'a CastOptions, - capacity: usize, - ) -> Result { - use ArrayVariantToArrowRowBuilder::*; - - // Make List/ListView builders without repeating the constructor boilerplate. - macro_rules! make_list_builder { - ($variant:ident, $offset:ty, $is_view:expr, $field:ident) => { - $variant(VariantToListArrowRowBuilder::<$offset, $is_view>::try_new( - $field.clone(), - $field.data_type(), - cast_options, - capacity, - )?) 
- }; - } - - let builder = match data_type { - DataType::List(field) => make_list_builder!(List, i32, false, field), - DataType::LargeList(field) => make_list_builder!(LargeList, i64, false, field), - DataType::ListView(field) => make_list_builder!(ListView, i32, true, field), - DataType::LargeListView(field) => make_list_builder!(LargeListView, i64, true, field), - other => { - return Err(ArrowError::InvalidArgumentError(format!( - "Casting to {other:?} is not applicable for array Variant types" - ))); - } - }; - Ok(builder) - } - - fn append_null(&mut self) { - match self { - Self::List(builder) => builder.append_null(), - Self::LargeList(builder) => builder.append_null(), - Self::ListView(builder) => builder.append_null(), - Self::LargeListView(builder) => builder.append_null(), - } - } - - fn append_value(&mut self, list: VariantList<'_, '_>) -> Result<()> { - match self { - Self::List(builder) => builder.append_value(list), - Self::LargeList(builder) => builder.append_value(list), - Self::ListView(builder) => builder.append_value(list), - Self::LargeListView(builder) => builder.append_value(list), - } - } - - fn finish(self) -> Result { - match self { - Self::List(builder) => builder.finish(), - Self::LargeList(builder) => builder.finish(), - Self::ListView(builder) => builder.finish(), - Self::LargeListView(builder) => builder.finish(), - } - } -} - -struct VariantToListArrowRowBuilder<'a, O, const IS_VIEW: bool> -where - O: OffsetSizeTrait + ArrowNativeTypeOp, -{ - field: FieldRef, - offsets: Vec, - element_builder: Box>, - nulls: NullBufferBuilder, - current_offset: O, -} - -impl<'a, O, const IS_VIEW: bool> VariantToListArrowRowBuilder<'a, O, IS_VIEW> -where - O: OffsetSizeTrait + ArrowNativeTypeOp, -{ - fn try_new( - field: FieldRef, - element_data_type: &'a DataType, - cast_options: &'a CastOptions, - capacity: usize, - ) -> Result { - if capacity >= isize::MAX as usize { - return Err(ArrowError::ComputeError( - "Capacity exceeds isize::MAX when reserving 
list offsets".to_string(), - )); - } - let mut offsets = Vec::with_capacity(capacity + 1); - offsets.push(O::ZERO); - let element_builder = make_variant_to_shredded_variant_arrow_row_builder( - element_data_type, - cast_options, - capacity, - false, - )?; - Ok(Self { - field, - offsets, - element_builder: Box::new(element_builder), - nulls: NullBufferBuilder::new(capacity), - current_offset: O::ZERO, - }) - } - - fn append_null(&mut self) { - self.offsets.push(self.current_offset); - self.nulls.append_null(); - } - - fn append_value(&mut self, list: VariantList<'_, '_>) -> Result<()> { - for element in list.iter() { - self.element_builder.append_value(element)?; - self.current_offset = self.current_offset.add_checked(O::ONE)?; - } - self.offsets.push(self.current_offset); - self.nulls.append_non_null(); - Ok(()) - } - - fn finish(mut self) -> Result { - let (value, typed_value, nulls) = self.element_builder.finish()?; - let element_array = - ShreddedVariantFieldArray::from_parts(Some(value), Some(typed_value), nulls); - let field = Arc::new( - self.field - .as_ref() - .clone() - .with_data_type(element_array.data_type().clone()), - ); - - if IS_VIEW { - // NOTE: `offsets` is never empty (constructor pushes an entry) - let mut sizes = Vec::with_capacity(self.offsets.len() - 1); - for i in 1..self.offsets.len() { - sizes.push(self.offsets[i] - self.offsets[i - 1]); - } - self.offsets.pop(); - let list_view_array = GenericListViewArray::::new( - field, - ScalarBuffer::from(self.offsets), - ScalarBuffer::from(sizes), - ArrayRef::from(element_array), - self.nulls.finish(), - ); - Ok(Arc::new(list_view_array)) - } else { - let list_array = GenericListArray::::new( - field, - OffsetBuffer::::new(ScalarBuffer::from(self.offsets)), - ArrayRef::from(element_array), - self.nulls.finish(), - ); - Ok(Arc::new(list_array)) - } - } -} - pub(crate) struct VariantToShreddedObjectVariantRowBuilder<'a> { value_builder: VariantValueArrayBuilder, typed_value_builders: IndexMap<&'a str, 
VariantToShreddedVariantRowBuilder<'a>>, @@ -1513,6 +1342,22 @@ mod tests { ); } + #[test] + fn test_array_shredding_as_fixed_size_list() { + let input = build_variant_array(vec![VariantRow::List(vec![ + VariantValue::from(1i64), + VariantValue::from(2i64), + VariantValue::from(3i64), + ])]); + let list_schema = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int64, true)), 2); + let err = shred_variant(&input, &list_schema).unwrap_err(); + assert_eq!( + err.to_string(), + "Not yet implemented: Converting unshredded variant arrays to arrow fixed-size lists" + ); + } + #[test] fn test_array_shredding_with_array_elements() { let input = build_variant_array(vec![ diff --git a/parquet-variant-compute/src/type_conversion.rs b/parquet-variant-compute/src/type_conversion.rs index 01065175653f..6a0a743c9029 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -25,19 +25,6 @@ use arrow::datatypes::{ use chrono::Timelike; use parquet_variant::{Variant, VariantDecimal4, VariantDecimal8, VariantDecimal16}; -/// Options for controlling the behavior of `cast_to_variant_with_options`. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct CastOptions { - /// If true, return error on conversion failure. If false, insert null for failed conversions. 
- pub strict: bool, -} - -impl Default for CastOptions { - fn default() -> Self { - Self { strict: true } - } -} - /// Extension trait for Arrow primitive types that can extract their native value from a Variant pub(crate) trait PrimitiveFromVariant: ArrowPrimitiveType { fn from_variant(variant: &Variant<'_, '_>) -> Option; diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 57d9944bb527..172bd4811bc3 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -15,23 +15,117 @@ // specific language governing permissions and limitations // under the License. +use crate::shred_variant::{ + VariantToShreddedVariantRowBuilder, make_variant_to_shredded_variant_arrow_row_builder, +}; +use crate::type_conversion::{ + PrimitiveFromVariant, TimestampFromVariant, variant_to_unscaled_decimal, +}; +use crate::variant_array::ShreddedVariantFieldArray; +use crate::{VariantArray, VariantValueArrayBuilder}; use arrow::array::{ - ArrayRef, BinaryBuilder, BinaryLikeArrayBuilder, BinaryViewArray, BinaryViewBuilder, - BooleanBuilder, FixedSizeBinaryBuilder, LargeBinaryBuilder, LargeStringBuilder, NullArray, - NullBufferBuilder, PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder, StringViewBuilder, + ArrayRef, ArrowNativeTypeOp, BinaryBuilder, BinaryLikeArrayBuilder, BinaryViewArray, + BinaryViewBuilder, BooleanBuilder, FixedSizeBinaryBuilder, GenericListArray, + GenericListViewArray, LargeBinaryBuilder, LargeStringBuilder, NullArray, NullBufferBuilder, + OffsetSizeTrait, PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder, StringViewBuilder, }; +use arrow::buffer::{OffsetBuffer, ScalarBuffer}; use arrow::compute::{CastOptions, DecimalCast}; use arrow::datatypes::{self, DataType, DecimalType}; use arrow::error::{ArrowError, Result}; -use parquet_variant::{Variant, VariantPath}; +use arrow_schema::{FieldRef, TimeUnit}; +use parquet_variant::{Variant, 
VariantList, VariantPath}; +use std::sync::Arc; -use crate::type_conversion::{ - PrimitiveFromVariant, TimestampFromVariant, variant_to_unscaled_decimal, -}; -use crate::{VariantArray, VariantValueArrayBuilder}; +/// Builder for converting variant values into strongly typed Arrow arrays. +/// +/// Useful for variant_get kernels that need to extract specific paths from variant values, possibly +/// with casting of leaf values to specific types. +pub(crate) enum VariantToArrowRowBuilder<'a> { + Primitive(PrimitiveVariantToArrowRowBuilder<'a>), + BinaryVariant(VariantToBinaryVariantArrowRowBuilder), -use arrow_schema::TimeUnit; -use std::sync::Arc; + // Path extraction wrapper - contains a boxed enum for any of the above + WithPath(VariantPathRowBuilder<'a>), +} + +impl<'a> VariantToArrowRowBuilder<'a> { + pub fn append_null(&mut self) -> Result<()> { + use VariantToArrowRowBuilder::*; + match self { + Primitive(b) => b.append_null(), + BinaryVariant(b) => b.append_null(), + WithPath(path_builder) => path_builder.append_null(), + } + } + + pub fn append_value(&mut self, value: Variant<'_, '_>) -> Result { + use VariantToArrowRowBuilder::*; + match self { + Primitive(b) => b.append_value(&value), + BinaryVariant(b) => b.append_value(value), + WithPath(path_builder) => path_builder.append_value(value), + } + } + + pub fn finish(self) -> Result { + use VariantToArrowRowBuilder::*; + match self { + Primitive(b) => b.finish(), + BinaryVariant(b) => b.finish(), + WithPath(path_builder) => path_builder.finish(), + } + } +} + +pub(crate) fn make_variant_to_arrow_row_builder<'a>( + metadata: &BinaryViewArray, + path: VariantPath<'a>, + data_type: Option<&'a DataType>, + cast_options: &'a CastOptions, + capacity: usize, +) -> Result> { + use VariantToArrowRowBuilder::*; + + let mut builder = match data_type { + // If no data type was requested, build an unshredded VariantArray. 
+ None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new( + metadata.clone(), + capacity, + )), + Some(DataType::Struct(_)) => { + return Err(ArrowError::NotYetImplemented( + "Converting unshredded variant objects to arrow structs".to_string(), + )); + } + Some( + DataType::List(_) + | DataType::LargeList(_) + | DataType::ListView(_) + | DataType::LargeListView(_) + | DataType::FixedSizeList(..), + ) => { + return Err(ArrowError::NotYetImplemented( + "Converting unshredded variant arrays to arrow lists".to_string(), + )); + } + Some(data_type) => { + let builder = + make_primitive_variant_to_arrow_row_builder(data_type, cast_options, capacity)?; + Primitive(builder) + } + }; + + // Wrap with path extraction if needed + if !path.is_empty() { + builder = WithPath(VariantPathRowBuilder { + builder: Box::new(builder), + path, + }) + }; + + Ok(builder) +} /// Builder for converting primitive variant values to Arrow arrays. It is used by both /// `VariantToArrowRowBuilder` (below) and `VariantToShreddedPrimitiveVariantRowBuilder` (in @@ -81,18 +175,6 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> { BinaryView(VariantToBinaryArrowRowBuilder<'a, BinaryViewBuilder>), } -/// Builder for converting variant values into strongly typed Arrow arrays. -/// -/// Useful for variant_get kernels that need to extract specific paths from variant values, possibly -/// with casting of leaf values to specific types. 
-pub(crate) enum VariantToArrowRowBuilder<'a> { - Primitive(PrimitiveVariantToArrowRowBuilder<'a>), - BinaryVariant(VariantToBinaryVariantArrowRowBuilder), - - // Path extraction wrapper - contains a boxed enum for any of the above - WithPath(VariantPathRowBuilder<'a>), -} - impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { pub fn append_null(&mut self) -> Result<()> { use PrimitiveVariantToArrowRowBuilder::*; @@ -227,35 +309,6 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { } } -impl<'a> VariantToArrowRowBuilder<'a> { - pub fn append_null(&mut self) -> Result<()> { - use VariantToArrowRowBuilder::*; - match self { - Primitive(b) => b.append_null(), - BinaryVariant(b) => b.append_null(), - WithPath(path_builder) => path_builder.append_null(), - } - } - - pub fn append_value(&mut self, value: Variant<'_, '_>) -> Result { - use VariantToArrowRowBuilder::*; - match self { - Primitive(b) => b.append_value(&value), - BinaryVariant(b) => b.append_value(value), - WithPath(path_builder) => path_builder.append_value(value), - } - } - - pub fn finish(self) -> Result { - use VariantToArrowRowBuilder::*; - match self { - Primitive(b) => b.finish(), - BinaryVariant(b) => b.finish(), - WithPath(path_builder) => path_builder.finish(), - } - } -} - /// Creates a row builder that converts primitive `Variant` values into the requested Arrow data type. 
pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>( data_type: &'a DataType, @@ -427,53 +480,78 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>( Ok(builder) } -pub(crate) fn make_variant_to_arrow_row_builder<'a>( - metadata: &BinaryViewArray, - path: VariantPath<'a>, - data_type: Option<&'a DataType>, - cast_options: &'a CastOptions, - capacity: usize, -) -> Result> { - use VariantToArrowRowBuilder::*; +pub(crate) enum ArrayVariantToArrowRowBuilder<'a> { + List(VariantToListArrowRowBuilder<'a, i32, false>), + LargeList(VariantToListArrowRowBuilder<'a, i64, false>), + ListView(VariantToListArrowRowBuilder<'a, i32, true>), + LargeListView(VariantToListArrowRowBuilder<'a, i64, true>), +} - let mut builder = match data_type { - // If no data type was requested, build an unshredded VariantArray. - None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new( - metadata.clone(), - capacity, - )), - Some(DataType::Struct(_)) => { - return Err(ArrowError::NotYetImplemented( - "Converting unshredded variant objects to arrow structs".to_string(), - )); - } - Some( - DataType::List(_) - | DataType::LargeList(_) - | DataType::ListView(_) - | DataType::LargeListView(_) - | DataType::FixedSizeList(..), - ) => { - return Err(ArrowError::NotYetImplemented( - "Converting unshredded variant arrays to arrow lists".to_string(), - )); +impl<'a> ArrayVariantToArrowRowBuilder<'a> { + pub(crate) fn try_new( + data_type: &'a DataType, + cast_options: &'a CastOptions, + capacity: usize, + ) -> Result { + use ArrayVariantToArrowRowBuilder::*; + + // Make List/ListView builders without repeating the constructor boilerplate. + macro_rules! make_list_builder { + ($variant:ident, $offset:ty, $is_view:expr, $field:ident) => { + $variant(VariantToListArrowRowBuilder::<$offset, $is_view>::try_new( + $field.clone(), + $field.data_type(), + cast_options, + capacity, + )?) 
+ }; } - Some(data_type) => { - let builder = - make_primitive_variant_to_arrow_row_builder(data_type, cast_options, capacity)?; - Primitive(builder) + + let builder = match data_type { + DataType::List(field) => make_list_builder!(List, i32, false, field), + DataType::LargeList(field) => make_list_builder!(LargeList, i64, false, field), + DataType::ListView(field) => make_list_builder!(ListView, i32, true, field), + DataType::LargeListView(field) => make_list_builder!(LargeListView, i64, true, field), + DataType::FixedSizeList(..) => { + return Err(ArrowError::NotYetImplemented( + "Converting unshredded variant arrays to arrow fixed-size lists".to_string(), + )); + } + other => { + return Err(ArrowError::InvalidArgumentError(format!( + "Casting to {other:?} is not applicable for array Variant types" + ))); + } + }; + Ok(builder) + } + + pub(crate) fn append_null(&mut self) { + match self { + Self::List(builder) => builder.append_null(), + Self::LargeList(builder) => builder.append_null(), + Self::ListView(builder) => builder.append_null(), + Self::LargeListView(builder) => builder.append_null(), } - }; + } - // Wrap with path extraction if needed - if !path.is_empty() { - builder = WithPath(VariantPathRowBuilder { - builder: Box::new(builder), - path, - }) - }; + pub(crate) fn append_value(&mut self, list: VariantList<'_, '_>) -> Result<()> { + match self { + Self::List(builder) => builder.append_value(list), + Self::LargeList(builder) => builder.append_value(list), + Self::ListView(builder) => builder.append_value(list), + Self::LargeListView(builder) => builder.append_value(list), + } + } - Ok(builder) + pub(crate) fn finish(self) -> Result { + match self { + Self::List(builder) => builder.finish(), + Self::LargeList(builder) => builder.finish(), + Self::ListView(builder) => builder.finish(), + Self::LargeListView(builder) => builder.finish(), + } + } } /// A thin wrapper whose only job is to extract a specific path from a variant value and pass the @@ -708,6 
+786,102 @@ impl<'a> VariantToUuidArrowRowBuilder<'a> { } } +pub(crate) struct VariantToListArrowRowBuilder<'a, O, const IS_VIEW: bool> +where + O: OffsetSizeTrait + ArrowNativeTypeOp, +{ + field: FieldRef, + offsets: Vec, + element_builder: Box>, + nulls: NullBufferBuilder, + current_offset: O, +} + +impl<'a, O, const IS_VIEW: bool> VariantToListArrowRowBuilder<'a, O, IS_VIEW> +where + O: OffsetSizeTrait + ArrowNativeTypeOp, +{ + fn try_new( + field: FieldRef, + element_data_type: &'a DataType, + cast_options: &'a CastOptions, + capacity: usize, + ) -> Result { + if capacity >= isize::MAX as usize { + return Err(ArrowError::ComputeError( + "Capacity exceeds isize::MAX when reserving list offsets".to_string(), + )); + } + let mut offsets = Vec::with_capacity(capacity + 1); + offsets.push(O::ZERO); + let element_builder = make_variant_to_shredded_variant_arrow_row_builder( + element_data_type, + cast_options, + capacity, + false, + )?; + Ok(Self { + field, + offsets, + element_builder: Box::new(element_builder), + nulls: NullBufferBuilder::new(capacity), + current_offset: O::ZERO, + }) + } + + fn append_null(&mut self) { + self.offsets.push(self.current_offset); + self.nulls.append_null(); + } + + fn append_value(&mut self, list: VariantList<'_, '_>) -> Result<()> { + for element in list.iter() { + self.element_builder.append_value(element)?; + self.current_offset = self.current_offset.add_checked(O::ONE)?; + } + self.offsets.push(self.current_offset); + self.nulls.append_non_null(); + Ok(()) + } + + fn finish(mut self) -> Result { + let (value, typed_value, nulls) = self.element_builder.finish()?; + let element_array = + ShreddedVariantFieldArray::from_parts(Some(value), Some(typed_value), nulls); + let field = Arc::new( + self.field + .as_ref() + .clone() + .with_data_type(element_array.data_type().clone()), + ); + + if IS_VIEW { + // NOTE: `offsets` is never empty (constructor pushes an entry) + let mut sizes = Vec::with_capacity(self.offsets.len() - 1); + for i 
in 1..self.offsets.len() { + sizes.push(self.offsets[i] - self.offsets[i - 1]); + } + self.offsets.pop(); + let list_view_array = GenericListViewArray::::new( + field, + ScalarBuffer::from(self.offsets), + ScalarBuffer::from(sizes), + ArrayRef::from(element_array), + self.nulls.finish(), + ); + Ok(Arc::new(list_view_array)) + } else { + let list_array = GenericListArray::::new( + field, + OffsetBuffer::::new(ScalarBuffer::from(self.offsets)), + ArrayRef::from(element_array), + self.nulls.finish(), + ); + Ok(Arc::new(list_array)) + } + } +} + /// Builder for creating VariantArray output (for path extraction without type conversion) pub(crate) struct VariantToBinaryVariantArrowRowBuilder { metadata: BinaryViewArray, diff --git a/parquet-variant-json/src/from_json.rs b/parquet-variant-json/src/from_json.rs index 33e1b2e6db9b..4c22785ef106 100644 --- a/parquet-variant-json/src/from_json.rs +++ b/parquet-variant-json/src/from_json.rs @@ -102,7 +102,7 @@ fn variant_from_number<'m, 'v>(n: &Number) -> Result, ArrowError } } -fn append_json(json: &Value, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { +pub fn append_json(json: &Value, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { match json { Value::Null => builder.append_value(Variant::Null), Value::Bool(b) => builder.append_value(*b), diff --git a/parquet-variant-json/src/lib.rs b/parquet-variant-json/src/lib.rs index f24c740818be..6b42b15bd480 100644 --- a/parquet-variant-json/src/lib.rs +++ b/parquet-variant-json/src/lib.rs @@ -34,5 +34,5 @@ mod from_json; mod to_json; -pub use from_json::JsonToVariant; +pub use from_json::{JsonToVariant, append_json}; pub use to_json::VariantToJson; diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 7094d935a5eb..e6122f062c38 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -14,7 +14,7 @@ // KIND, either express or implied. 
See the License for the // specific language governing permissions and limitations // under the License. -use crate::decoder::{VariantBasicType, VariantPrimitiveType}; +use crate::decoder::{OffsetSizeBytes, VariantBasicType, VariantPrimitiveType}; use crate::{ ShortString, Variant, VariantDecimal4, VariantDecimal8, VariantDecimal16, VariantList, VariantMetadata, VariantObject, @@ -43,21 +43,15 @@ fn short_string_header(len: usize) -> u8 { (len as u8) << 2 | VariantBasicType::ShortString as u8 } -pub(crate) fn int_size(v: usize) -> u8 { +pub(crate) fn int_size(v: usize) -> OffsetSizeBytes { match v { - 0..=0xFF => 1, - 0x100..=0xFFFF => 2, - 0x10000..=0xFFFFFF => 3, - _ => 4, + 0..=0xFF => OffsetSizeBytes::One, + 0x100..=0xFFFF => OffsetSizeBytes::Two, + 0x10000..=0xFFFFFF => OffsetSizeBytes::Three, + _ => OffsetSizeBytes::Four, } } -/// Write little-endian integer to buffer at a specific position -fn write_offset_at_pos(buf: &mut [u8], start_pos: usize, value: usize, nbytes: u8) { - let bytes = value.to_le_bytes(); - buf[start_pos..start_pos + nbytes as usize].copy_from_slice(&bytes[..nbytes as usize]); -} - /// Wrapper around a `Vec` that provides methods for appending /// primitive values, variant types, and metadata. 
/// @@ -358,63 +352,6 @@ impl ValueBuilder { ); state.finish(); } - - /// Writes out the header byte for a variant object or list, from the starting position - /// of the builder, will return the position after this write - pub(crate) fn append_header_start_from_buf_pos( - &mut self, - start_pos: usize, // the start position where the header will be inserted - header_byte: u8, - is_large: bool, - num_fields: usize, - ) -> usize { - let buffer = self.inner_mut(); - - // Write header at the original start position - let mut header_pos = start_pos; - - // Write header byte - buffer[header_pos] = header_byte; - header_pos += 1; - - // Write number of fields - if is_large { - buffer[header_pos..header_pos + 4].copy_from_slice(&(num_fields as u32).to_le_bytes()); - header_pos += 4; - } else { - buffer[header_pos] = num_fields as u8; - header_pos += 1; - } - - header_pos - } - - /// Writes out the offsets for an array of offsets, including the final offset (data size). - /// from the starting position of the buffer, will return the position after this write - pub(crate) fn append_offset_array_start_from_buf_pos( - &mut self, - start_pos: usize, - offsets: impl IntoIterator, - data_size: Option, - nbytes: u8, - ) -> usize { - let buf = self.inner_mut(); - - let mut current_pos = start_pos; - for relative_offset in offsets { - write_offset_at_pos(buf, current_pos, relative_offset, nbytes); - current_pos += nbytes as usize; - } - - // Write data_size - if let Some(data_size) = data_size { - // Write data_size at the end of the offsets - write_offset_at_pos(buf, current_pos, data_size, nbytes); - current_pos += nbytes as usize; - } - - current_pos - } } /// A trait for managing state specific to different builder types. 
diff --git a/parquet-variant/src/builder/list.rs b/parquet-variant/src/builder/list.rs index 4c2682c50ac4..5064904ca7de 100644 --- a/parquet-variant/src/builder/list.rs +++ b/parquet-variant/src/builder/list.rs @@ -174,7 +174,7 @@ impl<'a, S: BuilderSpecificState> ListBuilder<'a, S> { // Make sure to reserve enough capacity to handle the extra bytes we'll truncate. let mut bytes_to_splice = Vec::with_capacity(header_size + 3); // Write header - let header = array_header(is_large, offset_size); + let header = array_header(is_large, offset_size as _); bytes_to_splice.push(header); append_packed_u32(&mut bytes_to_splice, num_elements as u32, num_elements_size); diff --git a/parquet-variant/src/builder/metadata.rs b/parquet-variant/src/builder/metadata.rs index 10163ba3e0cf..efccc2e4c63e 100644 --- a/parquet-variant/src/builder/metadata.rs +++ b/parquet-variant/src/builder/metadata.rs @@ -206,7 +206,7 @@ impl WritableMetadataBuilder { // Determine appropriate offset size based on the larger of dict size or total string size let max_offset = std::cmp::max(total_dict_size, nkeys); - let offset_size = int_size(max_offset); + let offset_size = int_size(max_offset) as u8; let offset_start = 1 + offset_size as usize; let string_start = offset_start + (nkeys + 1) * offset_size as usize; diff --git a/parquet-variant/src/builder/object.rs b/parquet-variant/src/builder/object.rs index ab04360c16a7..876c2e2d4c7c 100644 --- a/parquet-variant/src/builder/object.rs +++ b/parquet-variant/src/builder/object.rs @@ -24,14 +24,50 @@ use crate::{ use arrow_schema::ArrowError; use indexmap::IndexMap; -fn object_header(large: bool, id_size: u8, offset_size: u8) -> u8 { - let large_bit = if large { 1 } else { 0 }; - (large_bit << (BASIC_TYPE_BITS + 4)) - | ((id_size - 1) << (BASIC_TYPE_BITS + 2)) - | ((offset_size - 1) << BASIC_TYPE_BITS) +fn object_header() -> u8 { + (LARGE_BIT << (BASIC_TYPE_BITS + 4)) + | ((ID_SIZE - 1) << (BASIC_TYPE_BITS + 2)) + | ((OFFSET_SIZE - 1) << BASIC_TYPE_BITS) 
| VariantBasicType::Object as u8 } +struct ObjectHeaderWriter(); + +impl ObjectHeaderWriter { + fn write( + dst: &mut Vec, + num_fields: usize, + field_ids: impl Iterator, + offsets: impl Iterator, + data_size: usize, + ) { + let is_large = num_fields > u8::MAX as usize; + // num_fields will consume 4 bytes when it is larger than u8::MAX + if is_large { + dst.push(object_header::<1, { ID_SIZE }, { OFFSET_SIZE }>()); + append_packed_u32::<4>(dst, num_fields); + } else { + dst.push(object_header::<0, { ID_SIZE }, { OFFSET_SIZE }>()); + append_packed_u32::<1>(dst, num_fields); + } + + for id in field_ids { + append_packed_u32::(dst, id as usize); + } + + for off in offsets { + append_packed_u32::(dst, off); + } + + append_packed_u32::(dst, data_size); + } +} + +#[inline(always)] +fn append_packed_u32(dest: &mut Vec, value: usize) { + dest.extend_from_slice(&value.to_le_bytes()[..SIZE as usize]); +} + /// A builder for creating [`Variant::Object`] values. /// /// See the examples on [`VariantBuilder`] for usage. @@ -245,41 +281,45 @@ impl<'a, S: BuilderSpecificState> ObjectBuilder<'a, S> { (num_fields * id_size as usize) + // field IDs ((num_fields + 1) * offset_size as usize); // field offsets + data_size + let mut bytes_to_splice = Vec::with_capacity(header_size); + + macro_rules! 
write_header { + ($offset_size:expr, $id_size:expr) => { + ObjectHeaderWriter::<{ $offset_size as u8 }, { $id_size as u8 }>::write( + &mut bytes_to_splice, + num_fields, + self.fields.keys().copied(), + self.fields.values().copied(), + data_size, + ) + }; + } + + use crate::decoder::OffsetSizeBytes::*; + match (offset_size, id_size) { + (One, One) => write_header!(One, One), + (One, Two) => write_header!(One, Two), + (One, Three) => write_header!(One, Three), + (One, Four) => write_header!(One, Four), + (Two, One) => write_header!(Two, One), + (Two, Two) => write_header!(Two, Two), + (Two, Three) => write_header!(Two, Three), + (Two, Four) => write_header!(Two, Four), + (Three, One) => write_header!(Three, One), + (Three, Two) => write_header!(Three, Two), + (Three, Three) => write_header!(Three, Three), + (Three, Four) => write_header!(Three, Four), + (Four, One) => write_header!(Four, One), + (Four, Two) => write_header!(Four, Two), + (Four, Three) => write_header!(Four, Three), + (Four, Four) => write_header!(Four, Four), + } + // Shift existing data to make room for the header - value_builder.inner_mut().splice( - starting_offset..starting_offset, - std::iter::repeat_n(0u8, header_size), - ); + value_builder + .inner_mut() + .splice(starting_offset..starting_offset, bytes_to_splice); - // Write header at the original start position - let mut header_pos = starting_offset; - - // Write header byte - let header = object_header(is_large, id_size, offset_size); - - header_pos = self - .parent_state - .value_builder() - .append_header_start_from_buf_pos(header_pos, header, is_large, num_fields); - - header_pos = self - .parent_state - .value_builder() - .append_offset_array_start_from_buf_pos( - header_pos, - self.fields.keys().copied().map(|id| id as usize), - None, - id_size, - ); - - self.parent_state - .value_builder() - .append_offset_array_start_from_buf_pos( - header_pos, - self.fields.values().copied(), - Some(data_size), - offset_size, - ); 
self.parent_state.finish(); } } diff --git a/parquet-variant/src/path.rs b/parquet-variant/src/path.rs index e222c3ac9ccb..2aeb9df97d82 100644 --- a/parquet-variant/src/path.rs +++ b/parquet-variant/src/path.rs @@ -16,6 +16,8 @@ // under the License. use std::{borrow::Cow, ops::Deref}; +use crate::utils::parse_path; + /// Represents a qualified path to a potential subfield or index of a variant /// value. /// @@ -112,11 +114,7 @@ impl<'a> From>> for VariantPath<'a> { /// Create from &str with support for dot notation impl<'a> From<&'a str> for VariantPath<'a> { fn from(path: &'a str) -> Self { - if path.is_empty() { - VariantPath::new(vec![]) - } else { - VariantPath::new(path.split('.').map(Into::into).collect()) - } + VariantPath::new(path.split(".").flat_map(parse_path).collect()) } } @@ -223,4 +221,35 @@ mod tests { let path = VariantPath::from_iter([p]); assert!(!path.is_empty()); } + + #[test] + fn test_variant_path_dot_notation_with_array_index() { + let path = VariantPath::from("city.store.books[3].title"); + + let expected = VariantPath::from("city") + .join("store") + .join("books") + .join(3) + .join("title"); + + assert_eq!(path, expected); + } + + #[test] + fn test_variant_path_dot_notation_with_only_array_index() { + let path = VariantPath::from("[3]"); + + let expected = VariantPath::from(3); + + assert_eq!(path, expected); + } + + #[test] + fn test_variant_path_dot_notation_with_starting_array_index() { + let path = VariantPath::from("[3].title"); + + let expected = VariantPath::from(3).join("title"); + + assert_eq!(path, expected); + } } diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs index d28b8685baa2..6accbcb36649 100644 --- a/parquet-variant/src/utils.rs +++ b/parquet-variant/src/utils.rs @@ -16,6 +16,7 @@ // under the License. 
use std::{array::TryFromSliceError, ops::Range, str}; +use crate::VariantPathElement; use arrow_schema::ArrowError; use std::cmp::Ordering; @@ -149,6 +150,38 @@ pub(crate) fn fits_precision(n: impl Into) -> bool { n.into().unsigned_abs().leading_zeros() >= (i64::BITS - N) } +// Helper fn to parse input segments like foo[0] or foo[0][0] +#[inline] +pub(crate) fn parse_path<'a>(segment: &'a str) -> Vec> { + if segment.is_empty() { + return Vec::new(); + } + + let mut path_elements = Vec::new(); + let mut base = segment; + + while let Some(stripped) = base.strip_suffix(']') { + let Some(open_pos) = stripped.rfind('[') else { + return vec![VariantPathElement::field(segment)]; + }; + + let index_str = &stripped[open_pos + 1..]; + let Ok(index) = index_str.parse::() else { + return vec![VariantPathElement::field(segment)]; + }; + + path_elements.push(VariantPathElement::index(index)); + base = &stripped[..open_pos]; + } + + if !base.is_empty() { + path_elements.push(VariantPathElement::field(base)); + } + + path_elements.reverse(); + path_elements +} + #[cfg(test)] mod test { use super::*; diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs index c962a4c3fdf8..6f5f56745e90 100644 --- a/parquet/benches/metadata.rs +++ b/parquet/benches/metadata.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use parquet::basic::{Encoding, PageType, Type as PhysicalType}; use parquet::file::metadata::{ ColumnChunkMetaData, FileMetaData, PageEncodingStats, ParquetMetaData, ParquetMetaDataOptions, - ParquetMetaDataReader, ParquetMetaDataWriter, ParquetStatisticsPolicy, RowGroupMetaData, + ParquetMetaDataReader, ParquetMetaDataWriter, RowGroupMetaData, }; use parquet::file::statistics::Statistics; use parquet::file::writer::TrackedWrite; @@ -164,26 +164,17 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - let schema = ParquetMetaDataReader::decode_schema(&meta_data).unwrap(); - let options = ParquetMetaDataOptions::new().with_schema(schema); - c.bench_function("decode 
metadata with schema", |b| { - b.iter(|| { - ParquetMetaDataReader::decode_metadata_with_options(&meta_data, Some(&options)) - .unwrap(); - }) - }); - - let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(true); - c.bench_function("decode metadata with stats mask", |b| { + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); + c.bench_function("decode metadata (full stats)", |b| { b.iter(|| { ParquetMetaDataReader::decode_metadata_with_options(&meta_data, Some(&options)) .unwrap(); }) }); - let options = - ParquetMetaDataOptions::new().with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll); - c.bench_function("decode metadata with skip PES", |b| { + let schema = ParquetMetaDataReader::decode_schema(&meta_data).unwrap(); + let options = ParquetMetaDataOptions::new().with_schema(schema); + c.bench_function("decode metadata with schema", |b| { b.iter(|| { ParquetMetaDataReader::decode_metadata_with_options(&meta_data, Some(&options)) .unwrap(); @@ -197,24 +188,16 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - let schema = ParquetMetaDataReader::decode_schema(&buf).unwrap(); - let options = ParquetMetaDataOptions::new().with_schema(schema); - c.bench_function("decode metadata (wide) with schema", |b| { + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); + c.bench_function("decode metadata (wide) (full stats)", |b| { b.iter(|| { ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); }) }); - let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(true); - c.bench_function("decode metadata (wide) with stats mask", |b| { - b.iter(|| { - ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); - }) - }); - - let options = - ParquetMetaDataOptions::new().with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll); - c.bench_function("decode metadata (wide) with skip PES", |b| { + let schema = 
ParquetMetaDataReader::decode_schema(&buf).unwrap(); + let options = ParquetMetaDataOptions::new().with_schema(schema); + c.bench_function("decode metadata (wide) with schema", |b| { b.iter(|| { ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); }) diff --git a/parquet/src/arrow/array_reader/cached_array_reader.rs b/parquet/src/arrow/array_reader/cached_array_reader.rs index a2fa0e903599..b55b1e1d1a65 100644 --- a/parquet/src/arrow/array_reader/cached_array_reader.rs +++ b/parquet/src/arrow/array_reader/cached_array_reader.rs @@ -201,7 +201,7 @@ impl ArrayReader for CachedArrayReader { // Check local cache first let cached = if let Some(array) = self.local_cache.get(&batch_id) { - Some(array.clone()) + Some(Arc::clone(array)) } else { // If not in local cache, i.e., we are consumer, check shared cache let cache_content = self @@ -211,7 +211,7 @@ impl ArrayReader for CachedArrayReader { .get(self.column_idx, batch_id); if let Some(array) = cache_content.as_ref() { // Store in local cache for later use in consume_batch - self.local_cache.insert(batch_id, array.clone()); + self.local_cache.insert(batch_id, Arc::clone(array)); } cache_content }; diff --git a/parquet/src/arrow/array_reader/struct_array.rs b/parquet/src/arrow/array_reader/struct_array.rs index 8df6a25c9102..b4a6a375334f 100644 --- a/parquet/src/arrow/array_reader/struct_array.rs +++ b/parquet/src/arrow/array_reader/struct_array.rs @@ -129,8 +129,8 @@ impl ArrayReader for StructArrayReader { .len(children_array_len) .child_data( children_array - .iter() - .map(|x| x.to_data()) + .into_iter() + .map(|x| x.into_data()) .collect::>(), ); diff --git a/parquet/src/arrow/arrow_reader/read_plan.rs b/parquet/src/arrow/arrow_reader/read_plan.rs index 3c17a358f084..7c9eb36befe3 100644 --- a/parquet/src/arrow/arrow_reader/read_plan.rs +++ b/parquet/src/arrow/arrow_reader/read_plan.rs @@ -110,19 +110,22 @@ impl ReadPlanBuilder { None => return RowSelectionStrategy::Selectors, }; - let 
trimmed = selection.clone().trim(); - let selectors: Vec = trimmed.into(); - if selectors.is_empty() { - return RowSelectionStrategy::Mask; - } - - let total_rows: usize = selectors.iter().map(|s| s.row_count).sum(); - let selector_count = selectors.len(); - if selector_count == 0 { + // total_rows: total number of rows selected / skipped + // effective_count: number of non-empty selectors + let (total_rows, effective_count) = + selection.iter().fold((0usize, 0usize), |(rows, count), s| { + if s.row_count > 0 { + (rows + s.row_count, count + 1) + } else { + (rows, count) + } + }); + + if effective_count == 0 { return RowSelectionStrategy::Mask; } - if total_rows < selector_count.saturating_mul(threshold) { + if total_rows < effective_count.saturating_mul(threshold) { RowSelectionStrategy::Mask } else { RowSelectionStrategy::Selectors diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 3c283bcbe3d2..59bf6c602438 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -550,8 +550,8 @@ impl LevelInfoBuilder { /// and the other is a native array, the dictionary values must have the same type as the /// native array fn types_compatible(a: &DataType, b: &DataType) -> bool { - // if the Arrow data types are the same, the types are clearly compatible - if a == b { + // if the Arrow data types are equal, the types are deemed compatible + if a.equals_datatype(b) { return true; } diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 3e3c9108d59c..a6cd2006782f 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1522,11 +1522,12 @@ fn get_fsb_array_slice( #[cfg(test)] mod tests { use super::*; + use std::collections::HashMap; use std::fs::File; - use crate::arrow::ARROW_SCHEMA_META_KEY; use crate::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder}; + use 
crate::arrow::{ARROW_SCHEMA_META_KEY, PARQUET_FIELD_ID_META_KEY}; use crate::column::page::{Page, PageReader}; use crate::file::metadata::thrift::PageHeader; use crate::file::page_index::column_index::ColumnIndexMetaData; @@ -1539,7 +1540,7 @@ mod tests { use arrow::util::data_gen::create_random_array; use arrow::util::pretty::pretty_format_batches; use arrow::{array::*, buffer::Buffer}; - use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano, NullBuffer, i256}; + use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano, NullBuffer, OffsetBuffer, i256}; use arrow_schema::Fields; use half::f16; use num_traits::{FromPrimitive, ToPrimitive}; @@ -3323,6 +3324,52 @@ mod tests { BinaryViewArray::from_iter_values(vec![b"barquet"]), LargeBinaryArray::from_iter_values(vec![b"parquet", b"barquet"]), ); + + // check compatibility for list types + + let list_field_metadata = HashMap::from_iter(vec![( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )]); + let list_field = Field::new_list_field(DataType::Int32, false); + + let values1 = Arc::new(Int32Array::from(vec![0, 1, 2, 3, 4])); + let offsets1 = OffsetBuffer::new(vec![0, 2, 5].into()); + + let values2 = Arc::new(Int32Array::from(vec![5, 6, 7, 8, 9])); + let offsets2 = OffsetBuffer::new(vec![0, 3, 5].into()); + + let values_expected = Arc::new(Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9])); + let offsets_expected = OffsetBuffer::new(vec![0, 2, 5, 8, 10].into()); + + ensure_compatible_write( + // when the initial schema has the metadata ... + ListArray::try_new( + Arc::new( + list_field + .clone() + .with_metadata(list_field_metadata.clone()), + ), + offsets1, + values1, + None, + ) + .unwrap(), + // ... and some intermediate schema doesn't have the metadata + ListArray::try_new(Arc::new(list_field.clone()), offsets2, values2, None).unwrap(), + // ... 
the write will still go through, and the resulting schema will inherit the initial metadata + ListArray::try_new( + Arc::new( + list_field + .clone() + .with_metadata(list_field_metadata.clone()), + ), + offsets_expected, + values_expected, + None, + ) + .unwrap(), + ); } #[test] @@ -4433,7 +4480,10 @@ mod tests { .unwrap(); // check that the read metadata is also correct - let options = ReadOptionsBuilder::new().with_page_index().build(); + let options = ReadOptionsBuilder::new() + .with_page_index() + .with_encoding_stats_as_mask(false) + .build(); let reader = SerializedFileReader::new_with_options(file, options).unwrap(); let rowgroup = reader.get_row_group(0).expect("row group missing"); diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 672ffb6fc521..52152988166f 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -494,7 +494,9 @@ pub fn parquet_column<'a>( #[cfg(test)] mod test { use crate::arrow::ArrowWriter; - use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader, ParquetMetaDataWriter}; + use crate::file::metadata::{ + ParquetMetaData, ParquetMetaDataOptions, ParquetMetaDataReader, ParquetMetaDataWriter, + }; use crate::file::properties::{EnabledStatistics, WriterProperties}; use crate::schema::parser::parse_message_type; use crate::schema::types::SchemaDescriptor; @@ -511,13 +513,17 @@ mod test { let parquet_bytes = create_parquet_file(); // read the metadata from the file WITHOUT the page index structures + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); let original_metadata = ParquetMetaDataReader::new() + .with_metadata_options(Some(options)) .parse_and_finish(&parquet_bytes) .unwrap(); // this should error because the page indexes are not present, but have offsets specified let metadata_bytes = metadata_to_bytes(&original_metadata); + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); let err = ParquetMetaDataReader::new() + 
.with_metadata_options(Some(options)) .with_page_indexes(true) // there are no page indexes in the metadata .parse_and_finish(&metadata_bytes) .err() @@ -533,7 +539,9 @@ mod test { let parquet_bytes = create_parquet_file(); // read the metadata from the file + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); let original_metadata = ParquetMetaDataReader::new() + .with_metadata_options(Some(options)) .parse_and_finish(&parquet_bytes) .unwrap(); @@ -545,7 +553,9 @@ mod test { "metadata is subset of parquet" ); + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); let roundtrip_metadata = ParquetMetaDataReader::new() + .with_metadata_options(Some(options)) .parse_and_finish(&metadata_bytes) .unwrap(); @@ -559,14 +569,18 @@ mod test { // read the metadata from the file including the page index structures // (which are stored elsewhere in the footer) + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); let original_metadata = ParquetMetaDataReader::new() + .with_metadata_options(Some(options)) .with_page_indexes(true) .parse_and_finish(&parquet_bytes) .unwrap(); // read metadata back from the serialized bytes and ensure it is the same let metadata_bytes = metadata_to_bytes(&original_metadata); + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); let roundtrip_metadata = ParquetMetaDataReader::new() + .with_metadata_options(Some(options)) .with_page_indexes(true) .parse_and_finish(&metadata_bytes) .unwrap(); diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 6bd426ee677f..ca3a9e10978b 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -1062,6 +1062,10 @@ impl ColumnChunkMetaData { /// Returns the page encoding statistics, or `None` if no page encoding statistics /// are available (or they were converted to a mask). 
+ /// + /// Note: By default, this crate converts page encoding statistics to a mask for performance + /// reasons. To get the full statistics, you must set [`ParquetMetaDataOptions::with_encoding_stats_as_mask`] + /// to `false`. pub fn page_encoding_stats(&self) -> Option<&Vec> { match self.encoding_stats.as_ref() { Some(ParquetPageEncodingStats::Full(stats)) => Some(stats), @@ -1072,6 +1076,8 @@ impl ColumnChunkMetaData { /// Returns the page encoding statistics reduced to a bitmask, or `None` if statistics are /// not available (or they were left in their original form). /// + /// Note: This is the default behavior for this crate. + /// /// The [`PageEncodingStats`] struct was added to the Parquet specification specifically to /// enable fast determination of whether all pages in a column chunk are dictionary encoded /// (see ). @@ -1667,7 +1673,9 @@ impl OffsetIndexBuilder { mod tests { use super::*; use crate::basic::{PageType, SortOrder}; - use crate::file::metadata::thrift::tests::{read_column_chunk, read_row_group}; + use crate::file::metadata::thrift::tests::{ + read_column_chunk, read_column_chunk_with_options, read_row_group, + }; #[test] fn test_row_group_metadata_thrift_conversion() { @@ -1822,7 +1830,72 @@ mod tests { let mut buf = Vec::new(); let mut writer = ThriftCompactOutputProtocol::new(&mut buf); col_metadata.write_thrift(&mut writer).unwrap(); - let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap(); + let col_chunk_res = read_column_chunk(&mut buf, column_descr.clone()).unwrap(); + + let expected_metadata = ColumnChunkMetaData::builder(column_descr) + .set_encodings_mask(EncodingMask::new_from_encodings( + [Encoding::PLAIN, Encoding::RLE].iter(), + )) + .set_file_path("file_path".to_owned()) + .set_num_values(1000) + .set_compression(Compression::SNAPPY) + .set_total_compressed_size(2000) + .set_total_uncompressed_size(3000) + .set_data_page_offset(4000) + .set_dictionary_page_offset(Some(5000)) + 
.set_page_encoding_stats_mask(EncodingMask::new_from_encodings( + [Encoding::PLAIN, Encoding::RLE].iter(), + )) + .set_bloom_filter_offset(Some(6000)) + .set_bloom_filter_length(Some(25)) + .set_offset_index_offset(Some(7000)) + .set_offset_index_length(Some(25)) + .set_column_index_offset(Some(8000)) + .set_column_index_length(Some(25)) + .set_unencoded_byte_array_data_bytes(Some(2000)) + .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100]))) + .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200]))) + .build() + .unwrap(); + + assert_eq!(col_chunk_res, expected_metadata); + } + + #[test] + fn test_column_chunk_metadata_thrift_conversion_full_stats() { + let column_descr = get_test_schema_descr().column(0); + let stats = vec![ + PageEncodingStats { + page_type: PageType::DATA_PAGE, + encoding: Encoding::PLAIN, + count: 3, + }, + PageEncodingStats { + page_type: PageType::DATA_PAGE, + encoding: Encoding::RLE, + count: 5, + }, + ]; + let col_metadata = ColumnChunkMetaData::builder(column_descr.clone()) + .set_encodings_mask(EncodingMask::new_from_encodings( + [Encoding::PLAIN, Encoding::RLE].iter(), + )) + .set_num_values(1000) + .set_compression(Compression::SNAPPY) + .set_total_compressed_size(2000) + .set_total_uncompressed_size(3000) + .set_data_page_offset(4000) + .set_page_encoding_stats(stats) + .build() + .unwrap(); + + let mut buf = Vec::new(); + let mut writer = ThriftCompactOutputProtocol::new(&mut buf); + col_metadata.write_thrift(&mut writer).unwrap(); + + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); + let col_chunk_res = + read_column_chunk_with_options(&mut buf, column_descr, Some(&options)).unwrap(); assert_eq!(col_chunk_res, col_metadata); } diff --git a/parquet/src/file/metadata/options.rs b/parquet/src/file/metadata/options.rs index c1ee22ff8de9..0bd0dfd9e30a 100644 --- a/parquet/src/file/metadata/options.rs +++ b/parquet/src/file/metadata/options.rs @@ -87,13 +87,23 @@ impl 
ParquetStatisticsPolicy { /// [`ParquetMetaData`]: crate::file::metadata::ParquetMetaData /// [`ParquetMetaDataReader`]: crate::file::metadata::ParquetMetaDataReader /// [`ParquetMetaDataPushDecoder`]: crate::file::metadata::ParquetMetaDataPushDecoder -#[derive(Default, Debug, Clone)] +#[derive(Debug, Clone)] pub struct ParquetMetaDataOptions { schema_descr: Option, encoding_stats_as_mask: bool, encoding_stats_policy: ParquetStatisticsPolicy, } +impl Default for ParquetMetaDataOptions { + fn default() -> Self { + Self { + schema_descr: None, + encoding_stats_as_mask: true, + encoding_stats_policy: ParquetStatisticsPolicy::KeepAll, + } + } +} + impl ParquetMetaDataOptions { /// Return a new default [`ParquetMetaDataOptions`]. pub fn new() -> Self { @@ -118,7 +128,7 @@ impl ParquetMetaDataOptions { } /// Returns whether to present the [`encoding_stats`] field of the Parquet `ColumnMetaData` - /// as a bitmask (defaults to `false`). + /// as a bitmask (defaults to `true`). /// /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this /// might be desirable. 
@@ -193,6 +203,12 @@ mod tests { }; use std::{io::Read, sync::Arc}; + #[test] + fn test_options_default() { + let options = ParquetMetaDataOptions::default(); + assert!(options.encoding_stats_as_mask()); + } + #[test] fn test_provide_schema() { let mut buf: Vec = Vec::new(); diff --git a/parquet/src/file/metadata/thrift/mod.rs b/parquet/src/file/metadata/thrift/mod.rs index 95ad67da6d95..154fde77edb9 100644 --- a/parquet/src/file/metadata/thrift/mod.rs +++ b/parquet/src/file/metadata/thrift/mod.rs @@ -410,7 +410,7 @@ fn read_column_metadata<'a>( let mut seen_mask = 0u16; let mut skip_pes = false; - let mut pes_mask = false; + let mut pes_mask = true; if let Some(opts) = options { skip_pes = opts.skip_encoding_stats(col_index); @@ -1704,7 +1704,7 @@ write_thrift_field!(RustBoundingBox, FieldType::Struct); pub(crate) mod tests { use crate::errors::Result; use crate::file::metadata::thrift::{BoundingBox, SchemaElement, write_schema}; - use crate::file::metadata::{ColumnChunkMetaData, RowGroupMetaData}; + use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaDataOptions, RowGroupMetaData}; use crate::parquet_thrift::tests::test_roundtrip; use crate::parquet_thrift::{ ElementType, ThriftCompactOutputProtocol, ThriftSliceInputProtocol, read_thrift_vec, @@ -1726,9 +1726,17 @@ pub(crate) mod tests { pub(crate) fn read_column_chunk( buf: &mut [u8], column_descr: Arc, + ) -> Result { + read_column_chunk_with_options(buf, column_descr, None) + } + + pub(crate) fn read_column_chunk_with_options( + buf: &mut [u8], + column_descr: Arc, + options: Option<&ParquetMetaDataOptions>, ) -> Result { let mut reader = ThriftSliceInputProtocol::new(buf); - crate::file::metadata::thrift::read_column_chunk(&mut reader, &column_descr, 0, None) + crate::file::metadata::thrift::read_column_chunk(&mut reader, &column_descr, 0, options) } pub(crate) fn roundtrip_schema(schema: TypePtr) -> Result { diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs 
index 8ef7b972d7e1..68b44f3cbbde 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -1855,7 +1855,10 @@ mod tests { fn test_file_reader_optional_metadata() { // file with optional metadata: bloom filters, encoding stats, column index and offset index. let file = get_test_file("data_index_bloom_encoding_stats.parquet"); - let file_reader = Arc::new(SerializedFileReader::new(file).unwrap()); + let options = ReadOptionsBuilder::new() + .with_encoding_stats_as_mask(false) + .build(); + let file_reader = Arc::new(SerializedFileReader::new_with_options(file, options).unwrap()); let row_group_metadata = file_reader.metadata.row_group(0); let col0_metadata = row_group_metadata.column(0);