diff --git a/.env b/.env index 3c146239ef2..cd6b57e004a 100644 --- a/.env +++ b/.env @@ -69,5 +69,6 @@ R_TAG=latest # -1 does not attempt to install a devtoolset version, any positive integer will install devtoolset-n DEVTOOLSET_VERSION=-1 -# Used for the manylinux and windows wheels +# Used for the manylinux and windows wheels, please update the crossbow configuration on update: +# https://github.com/ursacomputing/crossbow/blob/master/.github/workflows/cache_vcpkg.yml VCPKG=fced4bef1606260f110d74de1ae1975c2b9ac549 diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 8869de77b34..0bdecac2d6d 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -20,24 +20,24 @@ name: R on: push: paths: - - '.github/workflows/r.yml' - - 'ci/scripts/r_*.sh' - - 'ci/scripts/cpp_*.sh' - - 'ci/scripts/PKGBUILD' - - 'ci/etc/rprofile' - - 'ci/docker/**' - - 'cpp/**' - - 'r/**' + - ".github/workflows/r.yml" + - "ci/scripts/r_*.sh" + - "ci/scripts/cpp_*.sh" + - "ci/scripts/PKGBUILD" + - "ci/etc/rprofile" + - "ci/docker/**" + - "cpp/**" + - "r/**" pull_request: paths: - - '.github/workflows/r.yml' - - 'ci/scripts/r_*.sh' - - 'ci/scripts/cpp_*.sh' - - 'ci/scripts/PKGBUILD' - - 'ci/etc/rprofile' - - 'ci/docker/**' - - 'cpp/**' - - 'r/**' + - ".github/workflows/r.yml" + - "ci/scripts/r_*.sh" + - "ci/scripts/cpp_*.sh" + - "ci/scripts/PKGBUILD" + - "ci/etc/rprofile" + - "ci/docker/**" + - "cpp/**" + - "r/**" env: DOCKER_VOLUME_PREFIX: ".docker/" @@ -86,6 +86,15 @@ jobs: - name: Dump install logs run: cat r/check/arrow.Rcheck/00install.out if: always() + - name: Dump test logs + run: cat r/check/arrow.Rcheck/tests/testthat.Rout* + if: always() + - name: Save the test output + if: always() + uses: actions/upload-artifact@v2 + with: + name: test-output + path: r/check/arrow.Rcheck/tests/testthat.Rout* - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' continue-on-error: true @@ -99,8 +108,8 @@ jobs: fail-fast: false matrix: config: - - {org: 'rstudio', image: 'r-base', tag: '4.0-centos7'} - - {org: 'rhub', image: 'debian-gcc-devel', tag: 'latest'} + - { org: "rstudio", image: "r-base", tag: "4.0-centos7" } + - { org: "rhub", image: "debian-gcc-devel", tag: "latest" } env: R_ORG: ${{ matrix.config.org }} R_IMAGE: ${{ matrix.config.image }} @@ -134,6 +143,15 @@ jobs: - name: Dump install logs run: cat r/check/arrow.Rcheck/00install.out if: always() + - name: Dump test logs + run: cat r/check/arrow.Rcheck/tests/testthat.Rout* + if: always() + - name: Save the test output + if: always() + uses: actions/upload-artifact@v2 + with: + name: test-output + path: r/check/arrow.Rcheck/tests/testthat.Rout* - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' continue-on-error: true @@ -149,7 +167,7 @@ jobs: rtools: [35, 40] env: TEST_R_WITH_ARROW: "TRUE" - ARROW_R_CXXFLAGS: '-Werror' + ARROW_R_CXXFLAGS: "-Werror" _R_CHECK_TESTS_NLINES_: 0 steps: - run: git config --global core.autocrlf false @@ -187,13 +205,13 @@ jobs: - uses: r-lib/actions/setup-r@master with: rtools-version: 40 - r-version: '4.0' + r-version: "4.0" Ncpus: 2 - uses: r-lib/actions/setup-r@master if: ${{ matrix.rtools == 35 }} with: rtools-version: 35 - r-version: '3.6' + r-version: "3.6" Ncpus: 2 - name: Build Arrow C++ shell: bash @@ -221,7 +239,8 @@ jobs: build_args = '--no-build-vignettes', args = c('--no-manual', '--as-cran', '--ignore-vignettes', '--run-donttest'), error_on = 'warning', - check_dir = 'check' + check_dir = 'check', 
+ timeout = 3600 ) - name: Dump install logs shell: cmd diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 90831341d6c..1dd220ade94 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -116,8 +116,8 @@ jobs: # run tests on all workspace members with default feature list cargo test # test datafusion examples - cd datafusion - cargo test --no-default-features --features cli + cd datafusion-examples + cargo test --no-default-features cargo run --example csv_sql cargo run --example parquet_sql cd .. diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e70eaceaf41..9d2d2d81d68 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,9 +40,10 @@ repos: - id: cmake-format name: CMake Format language: python - entry: bash -c "pip install cmake-format && python run-cmake-format.py --check" - entry: echo - files: ^(.*/CMakeLists.txt|.*.cmake)$ + entry: python run-cmake-format.py + types: [cmake] + additional_dependencies: + - cmake_format==0.5.2 - id: hadolint name: Docker Format language: docker_image diff --git a/c_glib/README.md b/c_glib/README.md index 4ef9612c868..ac179354d8f 100644 --- a/c_glib/README.md +++ b/c_glib/README.md @@ -19,12 +19,14 @@ # Arrow GLib -Arrow GLib is a wrapper library for [Arrow C++](https://github.com/apache/arrow/tree/master/cpp). Arrow GLib provides C -API. +Arrow GLib is a wrapper library for [Arrow +C++](https://github.com/apache/arrow/tree/master/cpp). Arrow GLib +provides C API. -Arrow GLib supports -[GObject Introspection](https://wiki.gnome.org/action/show/Projects/GObjectIntrospection). -It means that you can create language bindings at runtime or compile time. +Arrow GLib supports [GObject +Introspection](https://wiki.gnome.org/action/show/Projects/GObjectIntrospection). +It means that you can create language bindings at runtime or compile +time. For example, you can use Apache Arrow from Ruby by Arrow GLib and [gobject-introspection gem](https://rubygems.org/gems/gobject-introspection) @@ -50,71 +52,47 @@ gobject-introspection gem based bindings. You can use packages or build by yourself to install Arrow GLib. It's recommended that you use packages. -Note that the packages are "unofficial". "Official" packages will be -released in the future. +We use Meson and Ninja as build tools. If you find problems when +installing please see [common build +problems](https://github.com/apache/arrow/blob/master/c_glib/README.md#common-build-problems). -We support two build systems, GNU Autotools and Meson. If you find problems when installing please see [common build problems](https://github.com/apache/arrow/blob/master/c_glib/README.md#common-build-problems). - -### Package +### Packages See [install document](https://arrow.apache.org/install/) for details. ### How to build by users Arrow GLib users should use released source archive to build Arrow -GLib (replace the version number in the following commands with the one you use): +GLib (replace the version number in the following commands with the +one you use): ```console -% wget https://archive.apache.org/dist/arrow/arrow-0.3.0/apache-arrow-0.3.0.tar.gz -% tar xf apache-arrow-0.3.0.tar.gz -% cd apache-arrow-0.3.0 +% wget https://downloads.apache.org/arrow/arrow-3.0.0/apache-arrow-3.0.0.tar.gz +% tar xf apache-arrow-3.0.0.tar.gz +% cd apache-arrow-3.0.0 ``` You need to build and install Arrow C++ before you build and install Arrow GLib. See Arrow C++ document about how to install Arrow C++. 
-If you use macOS with [Homebrew](https://brew.sh/), you must install required packages and set `PKG_CONFIG_PATH` before build Arrow GLib: - -If you use GNU Autotools, you can build and install Arrow GLib by the followings: - -macOS: - -```console -% cd c_glib -% brew bundle -% ./configure PKG_CONFIG_PATH=$(brew --prefix libffi)/lib/pkgconfig:$PKG_CONFIG_PATH -% make -% sudo make install -``` - -Others: - -```console -% cd c_glib -% ./configure -% make -% sudo make install -``` - -If you use Meson, you can build and install Arrow GLib by the followings: +If you use macOS with [Homebrew](https://brew.sh/), you must install +required packages. macOS: ```console -% cd c_glib -% brew bundle -% PKG_CONFIG_PATH=$(brew --prefix libffi)/lib/pkgconfig:$PKG_CONFIG_PATH meson build --buildtype=release -% ninja -C build -% sudo ninja -C build install +$ brew bundle +$ meson setup c_glib.build c_glib --buildtype=release +$ meson compile -C c_glib.build +$ sudo meson install -C c_glib.build ``` Others: ```console -% cd c_glib -% meson build --buildtype=release -% ninja -C build -% sudo ninja -C build install +$ meson setup c_glib.build c_glib --buildtype=release +$ meson compile -C c_glib.build +$ sudo meson install -C build ``` ### How to build by developers @@ -129,51 +107,46 @@ to build Arrow GLib. You can install them by the followings: On Debian GNU/Linux or Ubuntu: ```console -% sudo apt install -y -V gtk-doc-tools autoconf-archive libgirepository1.0-dev meson ninja-build +$ sudo apt install -y -V gtk-doc-tools libgirepository1.0-dev meson ninja-build ``` -On CentOS 7 or later: +On CentOS 7: ```console -% sudo yum install -y gtk-doc gobject-introspection-devel -% sudo pip install -y meson ninja -``` - -On macOS with [Homebrew](https://brew.sh/): - -```text -% brew bundle +$ sudo yum install -y gtk-doc gobject-introspection-devel ninja-build +$ sudo pip3 install meson ``` -If you use GNU Autotools, you can build and install Arrow GLib by the followings: +On CentOS 8 or later: ```console -% cd c_glib -% ./autogen.sh -% ./configure --enable-gtk-doc -% make -% sudo make install +$ sudo dnf install -y --enablerepo=powertools gtk-doc gobject-introspection-devel ninja-build +$ sudo pip3 install meson ``` -You need to set `PKG_CONFIG_PATH` to `configure` On macOS: +On macOS with [Homebrew](https://brew.sh/): ```console -% ./configure PKG_CONFIG_PATH=$(brew --prefix libffi)/lib/pkgconfig:$PKG_CONFIG_PATH --enable-gtk-doc +$ brew bundle ``` -If you use Meson, you can build and install Arrow GLib by the followings: +You can build and install Arrow GLib by the followings: + +macOS: ```console -% cd c_glib -% meson build -Dgtk_doc=true -% ninja -C build -% sudo ninja -C build install +$ XML_CATALOG_FILES=$(brew --prefix)/etc/xml/catalog +$ meson setup c_glib.build c_glib -Dgtk_doc=true +$ meson compile -C c_glib.build +$ sudo meson install -C c_glib.build ``` -You need to set `PKG_CONFIG_PATH` on macOS: +Others: ```console -% PKG_CONFIG_PATH=$(brew --prefix libffi)/lib/pkgconfig:$PKG_CONFIG_PATH meson build -Dgtk_doc=true +$ meson c_glib.build c_glib -Dgtk_doc=true +$ meson compile -C c_glib.build +$ sudo meson install -C c_glib.build ``` ## Usage @@ -186,7 +159,7 @@ languages, you use GObject Introspection based bindings. You can find API reference in the `/usr/local/share/gtk-doc/html/arrow-glib/` directory. If you specify -`--prefix` to `configure`, the directory will be different. +`--prefix` to `meson`, the directory will be different. You can find example codes in the `example/` directory. 
@@ -225,101 +198,118 @@ You can install them by the followings: On Debian GNU/Linux or Ubuntu: ```console -% sudo apt install -y -V ruby-dev -% sudo gem install bundler -% (cd c_glib && bundle install) +$ sudo apt install -y -V ruby-dev +$ sudo gem install bundler +$ (cd c_glib && bundle install) ``` On CentOS 7 or later: ```console -% sudo yum install -y git -% git clone https://github.com/sstephenson/rbenv.git ~/.rbenv -% git clone https://github.com/sstephenson/ruby-build.git ~/.rbenv/plugins/ruby-build -% echo 'export PATH="$HOME/.rbenv/bin:$PATH"' >> ~/.bash_profile -% echo 'eval "$(rbenv init -)"' >> ~/.bash_profile -% exec ${SHELL} --login -% sudo yum install -y gcc make patch openssl-devel readline-devel zlib-devel -% rbenv install 2.4.1 -% rbenv global 2.4.1 -% gem install bundler -% (cd c_glib && bundle install) +$ sudo yum install -y git +$ git clone https://github.com/sstephenson/rbenv.git ~/.rbenv +$ git clone https://github.com/sstephenson/ruby-build.git ~/.rbenv/plugins/ruby-build +$ echo 'export PATH="$HOME/.rbenv/bin:$PATH"' >> ~/.bash_profile +$ echo 'eval "$(rbenv init -)"' >> ~/.bash_profile +$ exec ${SHELL} --login +$ sudo yum install -y gcc make patch openssl-devel readline-devel zlib-devel +$ latest_ruby_version=$(rbenv install --list 2>&1 | grep '^[0-9]' | tail -n1) +$ rbenv install ${latest_ruby_version} +$ rbenv global ${latest_ruby_version} +$ gem install bundler +$ (cd c_glib && bundle install) ``` On macOS with [Homebrew](https://brew.sh/): ```console -% (cd c_glib && bundle install) +$ (cd c_glib && bundle install) ``` Now, you can run unit tests by the followings: ```console -% cd c_glib -% bundle exec test/run-test.sh +$ cd c_glib.build +$ bundle exec ../c_glib/test/run-test.sh ``` ## Common build problems -### configure failed - `AX_CXX_COMPILE_STDCXX_11(ext, mandatory)' +### build failed - /usr/bin/ld: cannot find -larrow -* Check whether `autoconf-archive` is installed. -* [macOS] `autoconf-archive` must be linked, but may not be linked. You can check it by running `brew install autoconf-archive` again. If it's not linked, it will show a warning message like: +Arrow C++ must be installed to build Arrow GLib. Run `make install` on +Arrow C++ build directory. In addition, on linux, you may need to run +`sudo ldconfig`. -```console -% brew install autoconf-archive -Warning: autoconf-archive 2017.03.21 is already installed, it's just not linked. -You can use `brew link autoconf-archive` to link this version. -``` +### build failed - unable to load http://docbook.sourceforge.net/release/xsl/current/html/chunk.xsl -In this case, you need to run `brew link autoconf-archive`. It may fail with the following message if you have install conflicted packages (e.g. `gnome-common`). +You need to set the following environment variable on macOS: ```console -% brew link autoconf-archive -Linking /usr/local/Cellar/autoconf-archive/2017.03.21... -Error: Could not symlink share/aclocal/ax_check_enable_debug.m4 -Target /usr/local/share/aclocal/ax_check_enable_debug.m4 -is a symlink belonging to gnome-common. You can unlink it: - brew unlink gnome-common +$ export XML_CATALOG_FILES="$(brew --prefix)/etc/xml/catalog" ``` -You need to run `brew unlink `, then run `brew link autoconf-archive` again. - -After installing/linking `autoconf-archive`, run `./autogen.sh` again. 
+### build failed - Symbol not found, referenced from `libsource-highlight.4.dylib` -### [macOS] configure failed - gobject-introspection-1.0 is not installed +You may get the following error on macOS: -gobject-introspection requires libffi, and it's automatically installed with gobject-introspection. However it can't be found because it's [keg-only](https://docs.brew.sh/FAQ.html#what-does-keg-only-mean). You need to set `PKG_CONFIG_PATH` when executing configure. -```console -% ./configure PKG_CONFIG_PATH=$(brew --prefix libffi)/lib/pkgconfig +```text +dyld: Symbol not found: __ZN5boost16re_detail_10650112perl_matcherIPKcNSt3__19allocatorINS_9sub_matchIS3_EEEENS_12regex_traitsIcNS_16cpp_regex_traitsIcEEEEE14construct_initERKNS_11basic_regexIcSC_EENS_15regex_constants12_match_flagsE + Referenced from: /usr/local/Cellar/source-highlight/3.1.8_7/lib/libsource-highlight.4.dylib + Expected in: flat namespace + in /usr/local/Cellar/source-highlight/3.1.8_7/lib/libsource-highlight.4.dylib ``` -### build failed - /usr/bin/ld: cannot find -larrow - -Arrow C++ must be installed to build Arrow GLib. Run `make install` on Arrow C++ build directory. In addition, on linux, you may need to run `sudo ldconfig`. - -### build failed - unable to load http://docbook.sourceforge.net/release/xsl/current/html/chunk.xsl - -On macOS you may need to set the following environment variable: +To fix this error, you need to upgrade `source-highlight`: ```console -% export XML_CATALOG_FILES="/usr/local/etc/xml/catalog" +$ brew upgrade source-highlight ``` -### build failed - Symbol not found, referenced from `libsource-highlight.4.dylib` +### test failed - Failed to load shared library '...' referenced by the typelib: dlopen(...): dependent dylib '@rpath/...' not found for '...'. relative file paths not allowed '@rpath/...' -On macOS if you see the following error you may need to upgrade `source-highlight` +You may get the following error on macOS by running test: -```console -dyld: Symbol not found: __ZN5boost16re_detail_10650112perl_matcherIPKcNSt3__19allocatorINS_9sub_matchIS3_EEEENS_12regex_traitsIcNS_16cpp_regex_traitsIcEEEEE14construct_initERKNS_11basic_regexIcSC_EENS_15regex_constants12_match_flagsE - Referenced from: /usr/local/Cellar/source-highlight/3.1.8_7/lib/libsource-highlight.4.dylib - Expected in: flat namespace - in /usr/local/Cellar/source-highlight/3.1.8_7/lib/libsource-highlight.4.dylib +```text +(NULL)-WARNING **: Failed to load shared library '/usr/local/lib/libparquet-glib.400.dylib' referenced by the typelib: dlopen(/usr/local/lib/libparquet-glib.400.dylib, 0x0009): dependent dylib '@rpath/libparquet.400.dylib' not found for '/usr/local/lib/libparquet-glib.400.dylib'. 
relative file paths not allowed '@rpath/libparquet.400.dylib' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/loader.rb:215:in `load_object_info' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/loader.rb:68:in `load_info' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/loader.rb:43:in `block in load' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/repository.rb:34:in `block (2 levels) in each' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/repository.rb:33:in `times' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/repository.rb:33:in `block in each' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/repository.rb:32:in `each' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/repository.rb:32:in `each' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/loader.rb:42:in `load' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection.rb:44:in `load' + from /Users/karlkatzen/Documents/code/arrow-dev/arrow/c_glib/test/run-test.rb:60:in `
' +Traceback (most recent call last): + 17: from /Users/karlkatzen/Documents/code/arrow-dev/arrow/c_glib/test/run-test.rb:80:in `
' + 16: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/autorunner.rb:66:in `run' + 15: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/autorunner.rb:434:in `run' + 14: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/autorunner.rb:106:in `block in ' + 13: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:38:in `collect' + 12: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:136:in `add_load_path' + 11: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:43:in `block in collect' + 10: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:43:in `each' + 9: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:46:in `block (2 levels) in collect' + 8: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:85:in `collect_recursive' + 7: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:85:in `each' + 6: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:87:in `block in collect_recursive' + 5: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:112:in `collect_file' + 4: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:136:in `add_load_path' + 3: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:114:in `block in collect_file' + 2: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:114:in `require' + 1: from /Users/karlkatzen/Documents/code/arrow-dev/arrow/c_glib/test/test-extension-data-type.rb:18:in `' +/Users/karlkatzen/Documents/code/arrow-dev/arrow/c_glib/test/test-extension-data-type.rb:19:in `': uninitialized constant Arrow::ExtensionArray (NameError) ``` -To fix do: +You can't use `@rpath` in Arrow C++. To fix this error, you need to +build Arrow C++ with `-DARROW_INSTALL_NAME_RPATH=OFF`: ```console -% brew upgrade source-highlight +$ cmake -S cpp -B cpp.build -DARROW_INSTALL_NAME_RPATH=OFF ... +$ cmake --build cpp.build +$ sudo cmake --build cpp.build --target install ``` diff --git a/c_glib/arrow-cuda-glib/Makefile.am b/c_glib/arrow-cuda-glib/Makefile.am deleted file mode 100644 index bcf20bb549e..00000000000 --- a/c_glib/arrow-cuda-glib/Makefile.am +++ /dev/null @@ -1,130 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -CLEANFILES = - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -if HAVE_ARROW_CUDA -lib_LTLIBRARIES = \ - libarrow-cuda-glib.la - -libarrow_cuda_glib_la_CXXFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CUDA_CFLAGS) \ - $(GARROW_CXXFLAGS) - -libarrow_cuda_glib_la_LDFLAGS = \ - -version-info $(LT_VERSION_INFO) \ - -no-undefined - -libarrow_cuda_glib_la_LIBADD = \ - $(GLIB_LIBS) \ - ../arrow-glib/libarrow-glib.la \ - $(ARROW_CUDA_LIBS) - -libarrow_cuda_glib_la_headers = \ - arrow-cuda-glib.h \ - cuda.h - -libarrow_cuda_glib_la_sources = \ - cuda.cpp \ - $(libarrow_cuda_glib_la_headers) - -libarrow_cuda_glib_la_cpp_headers = \ - arrow-cuda-glib.hpp \ - cuda.hpp - -libarrow_cuda_glib_la_SOURCES = \ - $(libarrow_cuda_glib_la_sources) \ - $(libarrow_cuda_glib_la_cpp_headers) - -arrow_cuda_glib_includedir = \ - $(includedir)/arrow-cuda-glib -arrow_cuda_glib_include_HEADERS = \ - $(libarrow_cuda_glib_la_headers) \ - $(libarrow_cuda_glib_la_cpp_headers) - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = \ - arrow-cuda-glib.pc - -if HAVE_INTROSPECTION --include $(INTROSPECTION_MAKEFILE) -INTROSPECTION_GIRS = -INTROSPECTION_SCANNER_ARGS = -INTROSPECTION_SCANNER_ENV = -if USE_ARROW_BUILD_DIR -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_builddir}/../arrow-glib:$(ARROW_BUILD_DIR)/src/arrow:$${PKG_CONFIG_PATH} -else -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_builddir}/../arrow-glib:$${PKG_CONFIG_PATH} -endif -INTROSPECTION_COMPILER_ARGS = \ - --includedir=$(abs_builddir)/../arrow-glib - -ArrowCUDA-1.0.gir: libarrow-cuda-glib.la -ArrowCUDA_1_0_gir_PACKAGES = \ - arrow-glib -ArrowCUDA_1_0_gir_EXPORT_PACKAGES = \ - arrow-cuda-glib -ArrowCUDA_1_0_gir_INCLUDES = \ - Arrow-1.0 -ArrowCUDA_1_0_gir_CFLAGS = \ - $(AM_CPPFLAGS) -ArrowCUDA_1_0_gir_LIBS = -ArrowCUDA_1_0_gir_FILES = \ - $(libarrow_cuda_glib_la_sources) -ArrowCUDA_1_0_gir_SCANNERFLAGS = \ - --library-path=$(ARROW_LIB_DIR) \ - --warn-all \ - --add-include-path=$(abs_builddir)/../arrow-glib \ - --identifier-prefix=GArrowCUDA \ - --symbol-prefix=garrow_cuda -if OS_MACOS -ArrowCUDA_1_0_gir_LIBS += \ - arrow-glib \ - arrow-cuda-glib -ArrowCUDA_1_0_gir_SCANNERFLAGS += \ - --no-libtool \ - --library-path=$(abs_builddir)/../arrow-glib/.libs \ - --library-path=$(abs_builddir)/.libs -else -ArrowCUDA_1_0_gir_LIBS += \ - $(abs_builddir)/../arrow-glib/libarrow-glib.la \ - libarrow-cuda-glib.la -endif - -INTROSPECTION_GIRS += ArrowCUDA-1.0.gir - -girdir = $(datadir)/gir-1.0 -gir_DATA = $(INTROSPECTION_GIRS) - -typelibdir = $(libdir)/girepository-1.0 -typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) - -CLEANFILES += \ - $(gir_DATA) \ - $(typelib_DATA) -endif -endif diff --git a/c_glib/arrow-cuda-glib/arrow-cuda-glib.pc.in b/c_glib/arrow-cuda-glib/arrow-cuda-glib.pc.in deleted file mode 100644 index de0ce974c7a..00000000000 --- a/c_glib/arrow-cuda-glib/arrow-cuda-glib.pc.in +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: Apache Arrow CUDA GLib -Description: C API for Apache Arrow CUDA based on GLib -Version: @VERSION@ -Libs: -L${libdir} -larrow-cuda-glib -Cflags: -I${includedir} -Requires: arrow-glib arrow-cuda diff --git a/c_glib/arrow-dataset-glib/Makefile.am b/c_glib/arrow-dataset-glib/Makefile.am deleted file mode 100644 index 81e5aa5f083..00000000000 --- a/c_glib/arrow-dataset-glib/Makefile.am +++ /dev/null @@ -1,136 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -CLEANFILES = - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -if HAVE_ARROW_DATASET -lib_LTLIBRARIES = \ - libarrow-dataset-glib.la - -libarrow_dataset_glib_la_CXXFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_DATASET_CFLAGS) \ - $(GARROW_CXXFLAGS) - -libarrow_dataset_glib_la_LDFLAGS = \ - -version-info $(LT_VERSION_INFO) \ - -no-undefined - -libarrow_dataset_glib_la_LIBADD = \ - $(GLIB_LIBS) \ - ../arrow-glib/libarrow-glib.la \ - $(ARROW_DATASET_LIBS) - -libarrow_dataset_glib_la_headers = \ - arrow-dataset-glib.h \ - file-format.h \ - fragment.h \ - scanner.h - -libarrow_dataset_glib_la_sources = \ - file-format.cpp \ - fragment.cpp \ - scanner.cpp \ - $(libarrow_dataset_glib_la_headers) - -libarrow_dataset_glib_la_cpp_headers = \ - arrow-dataset-glib.hpp \ - file-format.hpp \ - fragment.hpp \ - scanner.hpp - -libarrow_dataset_glib_la_SOURCES = \ - $(libarrow_dataset_glib_la_sources) \ - $(libarrow_dataset_glib_la_cpp_headers) - -arrow_dataset_glib_includedir = \ - $(includedir)/arrow-dataset-glib -arrow_dataset_glib_include_HEADERS = \ - $(libarrow_dataset_glib_la_headers) \ - $(libarrow_dataset_glib_la_cpp_headers) - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = \ - arrow-dataset-glib.pc - -if HAVE_INTROSPECTION --include $(INTROSPECTION_MAKEFILE) -INTROSPECTION_GIRS = -INTROSPECTION_SCANNER_ARGS = -INTROSPECTION_SCANNER_ENV = -if USE_ARROW_BUILD_DIR -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_builddir}/../arrow-glib:$(ARROW_BUILD_DIR)/src/arrow:$${PKG_CONFIG_PATH} -else -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_builddir}/../arrow-glib:$${PKG_CONFIG_PATH} -endif -INTROSPECTION_COMPILER_ARGS = \ - --includedir=$(abs_builddir)/../arrow-glib - -ArrowDataset-1.0.gir: libarrow-dataset-glib.la -ArrowDataset_1_0_gir_PACKAGES = \ 
- arrow-glib -ArrowDataset_1_0_gir_EXPORT_PACKAGES = \ - arrow-dataset-glib -ArrowDataset_1_0_gir_INCLUDES = \ - Arrow-1.0 -ArrowDataset_1_0_gir_CFLAGS = \ - $(AM_CPPFLAGS) -ArrowDataset_1_0_gir_LIBS = -ArrowDataset_1_0_gir_FILES = \ - $(libarrow_dataset_glib_la_sources) -ArrowDataset_1_0_gir_SCANNERFLAGS = \ - --add-include-path=$(abs_builddir)/../arrow-glib \ - --identifier-prefix=GAD \ - --library-path=$(ARROW_LIB_DIR) \ - --symbol-prefix=gad \ - --warn-all -if OS_MACOS -ArrowDataset_1_0_gir_LIBS += \ - arrow-glib \ - arrow-dataset-glib -ArrowDataset_1_0_gir_SCANNERFLAGS += \ - --no-libtool \ - --library-path=$(abs_builddir)/../arrow-glib/.libs \ - --library-path=$(abs_builddir)/.libs -else -ArrowDataset_1_0_gir_LIBS += \ - $(abs_builddir)/../arrow-glib/libarrow-glib.la \ - libarrow-dataset-glib.la -endif - -INTROSPECTION_GIRS += ArrowDataset-1.0.gir - -girdir = $(datadir)/gir-1.0 -gir_DATA = $(INTROSPECTION_GIRS) - -typelibdir = $(libdir)/girepository-1.0 -typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) - -CLEANFILES += \ - $(gir_DATA) \ - $(typelib_DATA) -endif -endif diff --git a/c_glib/arrow-glib/Makefile.am b/c_glib/arrow-glib/Makefile.am deleted file mode 100644 index 9f19578d537..00000000000 --- a/c_glib/arrow-glib/Makefile.am +++ /dev/null @@ -1,309 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -CLEANFILES = -DISTCLEANFILES = - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(GARROW_CFLAGS) - -# libarrow-glib -lib_LTLIBRARIES = \ - libarrow-glib.la - -libarrow_glib_la_CXXFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CFLAGS) \ - $(GARROW_CXXFLAGS) - -libarrow_glib_la_LDFLAGS = \ - -version-info $(LT_VERSION_INFO) \ - -no-undefined - -libarrow_glib_la_LIBADD = \ - $(GLIB_LIBS) \ - $(ARROW_LIBS) - -libarrow_glib_la_headers = \ - array.h \ - array-builder.h \ - arrow-glib.h \ - basic-array.h \ - basic-data-type.h \ - buffer.h \ - chunked-array.h \ - codec.h \ - composite-array.h \ - composite-data-type.h \ - data-type.h \ - datum.h \ - decimal.h \ - error.h \ - field.h \ - gobject-type.h \ - record-batch.h \ - schema.h \ - table.h \ - table-builder.h \ - tensor.h \ - type.h - -libarrow_glib_la_headers += \ - file.h \ - file-mode.h \ - input-stream.h \ - output-stream.h \ - readable.h \ - writable.h \ - writable-file.h - -libarrow_glib_la_headers += \ - ipc-options.h \ - metadata-version.h \ - reader.h \ - writer.h - -libarrow_glib_la_headers += \ - compute.h - -libarrow_glib_la_headers += \ - file-system.h \ - local-file-system.h - -if HAVE_ARROW_ORC -libarrow_glib_la_headers += \ - orc-file-reader.h -endif - -libarrow_glib_la_generated_headers = \ - enums.h \ - version.h - -libarrow_glib_la_generated_sources = \ - enums.c \ - $(libarrow_glib_la_generated_headers) - -libarrow_glib_la_sources = \ - array-builder.cpp \ - basic-array.cpp \ - basic-data-type.cpp \ - buffer.cpp \ - chunked-array.cpp \ - codec.cpp \ - composite-array.cpp \ - composite-data-type.cpp \ - datum.cpp \ - decimal.cpp \ - error.cpp \ - field.cpp \ - record-batch.cpp \ - schema.cpp \ - table.cpp \ - table-builder.cpp \ - tensor.cpp \ - type.cpp \ - $(libarrow_glib_la_headers) \ - $(libarrow_glib_la_generated_sources) - -libarrow_glib_la_sources += \ - file.cpp \ - file-mode.cpp \ - input-stream.cpp \ - output-stream.cpp \ - readable.cpp \ - writable.cpp \ - writable-file.cpp - -libarrow_glib_la_sources += \ - ipc-options.cpp \ - metadata-version.cpp \ - reader.cpp \ - writer.cpp - -libarrow_glib_la_sources += \ - compute.cpp - -libarrow_glib_la_sources += \ - file-system.cpp \ - local-file-system.cpp - -if HAVE_ARROW_ORC -libarrow_glib_la_sources += \ - orc-file-reader.cpp -endif - -libarrow_glib_la_cpp_headers = \ - array.hpp \ - array-builder.hpp \ - arrow-glib.hpp \ - basic-array.hpp \ - basic-data-type.hpp \ - buffer.hpp \ - chunked-array.hpp \ - codec.hpp \ - data-type.hpp \ - datum.hpp \ - decimal.hpp \ - error.hpp \ - field.hpp \ - record-batch.hpp \ - schema.hpp \ - table.hpp \ - table-builder.hpp \ - tensor.hpp \ - type.hpp - -libarrow_glib_la_cpp_headers += \ - file.hpp \ - file-mode.hpp \ - input-stream.hpp \ - output-stream.hpp \ - readable.hpp \ - writable.hpp \ - writable-file.hpp - -libarrow_glib_la_cpp_headers += \ - ipc-options.hpp \ - metadata-version.hpp \ - reader.hpp \ - writer.hpp - -libarrow_glib_la_cpp_headers += \ - compute.hpp - -libarrow_glib_la_cpp_headers += \ - file-system.hpp \ - local-file-system.hpp - -if HAVE_ARROW_ORC -libarrow_glib_la_cpp_headers += \ - orc-file-reader.hpp -endif - -libarrow_glib_la_cpp_internal_headers = \ - internal-hash-table.hpp \ - internal-index.hpp - -libarrow_glib_la_SOURCES = \ - $(libarrow_glib_la_sources) \ - $(libarrow_glib_la_cpp_headers) \ - $(libarrow_glib_la_cpp_internal_headers) - -BUILT_SOURCES = \ - $(libarrow_glib_la_generated_headers) \ - 
$(libarrow_glib_la_generated_sources) \ - stamp-enums.c \ - stamp-enums.h - -DISTCLEANFILES += \ - stamp-enums.c \ - stamp-enums.h - -EXTRA_DIST += \ - enums.c.template \ - enums.h.template - -enums.h: stamp-enums.h - @true -stamp-enums.h: $(libarrow_glib_la_headers) enums.h.template - $(AM_V_GEN) \ - (cd $(srcdir) && \ - $(GLIB_MKENUMS) \ - --identifier-prefix GArrow \ - --symbol-prefix garrow \ - --template enums.h.template \ - $(libarrow_glib_la_headers)) > enums.h - touch $@ - -enums.c: stamp-enums.c - @true -stamp-enums.c: $(libarrow_glib_la_headers) enums.c.template - $(AM_V_GEN) \ - (cd $(srcdir) && \ - $(GLIB_MKENUMS) \ - --identifier-prefix GArrow \ - --symbol-prefix garrow \ - --template enums.c.template \ - $(libarrow_glib_la_headers)) > enums.c - touch $@ - -arrow_glib_includedir = $(includedir)/arrow-glib -arrow_glib_include_HEADERS = \ - $(libarrow_glib_la_headers) \ - $(libarrow_glib_la_cpp_headers) \ - $(libarrow_glib_la_generated_headers) - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = \ - arrow-glib.pc - -if HAVE_ARROW_ORC -pkgconfig_DATA += \ - arrow-orc-glib.pc -endif - -if HAVE_INTROSPECTION --include $(INTROSPECTION_MAKEFILE) -INTROSPECTION_GIRS = -INTROSPECTION_SCANNER_ARGS = -INTROSPECTION_SCANNER_ENV = -INTROSPECTION_COMPILER_ARGS = - -Arrow-1.0.gir: libarrow-glib.la -Arrow_1_0_gir_PACKAGES = \ - gio-2.0 -Arrow_1_0_gir_EXPORT_PACKAGES = \ - arrow-glib -Arrow_1_0_gir_INCLUDES = \ - Gio-2.0 -Arrow_1_0_gir_CFLAGS = \ - $(AM_CPPFLAGS) -Arrow_1_0_gir_LIBS = -Arrow_1_0_gir_FILES = $(libarrow_glib_la_sources) -Arrow_1_0_gir_SCANNERFLAGS = \ - --library-path=$(ARROW_LIB_DIR) \ - --warn-all \ - --identifier-prefix=GArrow \ - --symbol-prefix=garrow -if OS_MACOS -Arrow_1_0_gir_LIBS += arrow-glib -Arrow_1_0_gir_SCANNERFLAGS += \ - --no-libtool \ - --library-path=$(abs_builddir)/.libs -else -Arrow_1_0_gir_LIBS += libarrow-glib.la -endif -INTROSPECTION_GIRS += Arrow-1.0.gir - -girdir = $(datadir)/gir-1.0 -gir_DATA = $(INTROSPECTION_GIRS) - -typelibdir = $(libdir)/girepository-1.0 -typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) - -CLEANFILES += \ - $(gir_DATA) \ - $(typelib_DATA) -endif diff --git a/c_glib/arrow-glib/arrow-glib.pc.in b/c_glib/arrow-glib/arrow-glib.pc.in deleted file mode 100644 index f9f27b24990..00000000000 --- a/c_glib/arrow-glib/arrow-glib.pc.in +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: Apache Arrow GLib -Description: C API for Apache Arrow based on GLib -Version: @VERSION@ -Libs: -L${libdir} -larrow-glib -Cflags: -I${includedir} -Requires: gobject-2.0 arrow diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp index db6fa544069..636a3c74707 100644 --- a/c_glib/arrow-glib/reader.cpp +++ b/c_glib/arrow-glib/reader.cpp @@ -1479,10 +1479,12 @@ garrow_csv_read_options_add_column_name(GArrowCSVReadOptions *options, typedef struct GArrowCSVReaderPrivate_ { std::shared_ptr reader; + GArrowInputStream *input; } GArrowCSVReaderPrivate; enum { - PROP_CSV_TABLE_READER = 1 + PROP_CSV_TABLE_READER = 1, + PROP_CSV_READER_INPUT, }; G_DEFINE_TYPE_WITH_PRIVATE(GArrowCSVReader, @@ -1499,11 +1501,24 @@ garrow_csv_reader_dispose(GObject *object) { auto priv = GARROW_CSV_READER_GET_PRIVATE(object); - priv->reader = nullptr; + if (priv->input) { + g_object_unref(priv->input); + priv->input = nullptr; + } G_OBJECT_CLASS(garrow_csv_reader_parent_class)->dispose(object); } +static void +garrow_csv_reader_finalize(GObject *object) +{ + auto priv = GARROW_CSV_READER_GET_PRIVATE(object); + + priv->reader.~shared_ptr(); + + G_OBJECT_CLASS(garrow_csv_reader_parent_class)->finalize(object); +} + static void garrow_csv_reader_set_property(GObject *object, guint prop_id, @@ -1517,6 +1532,9 @@ garrow_csv_reader_set_property(GObject *object, priv->reader = *static_cast *>(g_value_get_pointer(value)); break; + case PROP_CSV_READER_INPUT: + priv->input = GARROW_INPUT_STREAM(g_value_dup_object(value)); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -1529,7 +1547,12 @@ garrow_csv_reader_get_property(GObject *object, GValue *value, GParamSpec *pspec) { + auto priv = GARROW_CSV_READER_GET_PRIVATE(object); + switch (prop_id) { + case PROP_CSV_READER_INPUT: + g_value_set_object(value, priv->input); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -1539,25 +1562,37 @@ garrow_csv_reader_get_property(GObject *object, static void garrow_csv_reader_init(GArrowCSVReader *object) { + auto priv = GARROW_CSV_READER_GET_PRIVATE(object); + new(&priv->reader) std::shared_ptr; } static void garrow_csv_reader_class_init(GArrowCSVReaderClass *klass) { - GParamSpec *spec; - auto gobject_class = G_OBJECT_CLASS(klass); gobject_class->dispose = garrow_csv_reader_dispose; + gobject_class->finalize = garrow_csv_reader_finalize; gobject_class->set_property = garrow_csv_reader_set_property; gobject_class->get_property = garrow_csv_reader_get_property; + GParamSpec *spec; spec = g_param_spec_pointer("csv-table-reader", "CSV table reader", "The raw std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_CSV_TABLE_READER, spec); + + spec = g_param_spec_object("input", + "Input", + "The input stream to be read", + GARROW_TYPE_INPUT_STREAM, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, + PROP_CSV_READER_INPUT, + spec); } /** @@ -1597,7 +1632,7 @@ garrow_csv_reader_new(GArrowInputStream *input, parse_options, convert_options); if (garrow::check(error, arrow_reader, "[csv-reader][new]")) { - return garrow_csv_reader_new_raw(&(arrow_reader.ValueOrDie())); + return garrow_csv_reader_new_raw(&(*arrow_reader), input); } else { return NULL; } @@ -1633,11 +1668,11 @@ typedef struct GArrowJSONReadOptionsPrivate_ { } 
GArrowJSONReadOptionsPrivate; enum { - PROP_JSON_READER_USE_THREADS = 1, - PROP_JSON_READER_BLOCK_SIZE, - PROP_JSON_READER_ALLOW_NEWLINES_IN_VALUES, - PROP_JSON_READER_UNEXPECTED_FIELD_BEHAVIOR, - PROP_JSON_READER_SCHEMA + PROP_JSON_READ_OPTIONS_USE_THREADS = 1, + PROP_JSON_READ_OPTIONS_BLOCK_SIZE, + PROP_JSON_READ_OPTIONS_ALLOW_NEWLINES_IN_VALUES, + PROP_JSON_READ_OPTIONS_UNEXPECTED_FIELD_BEHAVIOR, + PROP_JSON_READ_OPTIONS_SCHEMA, }; G_DEFINE_TYPE_WITH_PRIVATE(GArrowJSONReadOptions, @@ -1671,20 +1706,20 @@ garrow_json_read_options_set_property(GObject *object, auto priv = GARROW_JSON_READ_OPTIONS_GET_PRIVATE(object); switch (prop_id) { - case PROP_JSON_READER_USE_THREADS: + case PROP_JSON_READ_OPTIONS_USE_THREADS: priv->read_options.use_threads = g_value_get_boolean(value); break; - case PROP_JSON_READER_BLOCK_SIZE: + case PROP_JSON_READ_OPTIONS_BLOCK_SIZE: priv->read_options.block_size = g_value_get_int(value); break; - case PROP_JSON_READER_ALLOW_NEWLINES_IN_VALUES: + case PROP_JSON_READ_OPTIONS_ALLOW_NEWLINES_IN_VALUES: priv->parse_options.newlines_in_values = g_value_get_boolean(value); break; - case PROP_JSON_READER_UNEXPECTED_FIELD_BEHAVIOR: + case PROP_JSON_READ_OPTIONS_UNEXPECTED_FIELD_BEHAVIOR: priv->parse_options.unexpected_field_behavior = static_cast(g_value_get_enum(value)); break; - case PROP_JSON_READER_SCHEMA: + case PROP_JSON_READ_OPTIONS_SCHEMA: { auto schema = g_value_dup_object(value); if (priv->schema) { @@ -1714,19 +1749,19 @@ garrow_json_read_options_get_property(GObject *object, auto priv = GARROW_JSON_READ_OPTIONS_GET_PRIVATE(object); switch (prop_id) { - case PROP_JSON_READER_USE_THREADS: + case PROP_JSON_READ_OPTIONS_USE_THREADS: g_value_set_boolean(value, priv->read_options.use_threads); break; - case PROP_JSON_READER_BLOCK_SIZE: + case PROP_JSON_READ_OPTIONS_BLOCK_SIZE: g_value_set_int(value, priv->read_options.block_size); break; - case PROP_JSON_READER_ALLOW_NEWLINES_IN_VALUES: + case PROP_JSON_READ_OPTIONS_ALLOW_NEWLINES_IN_VALUES: g_value_set_boolean(value, priv->parse_options.newlines_in_values); break; - case PROP_JSON_READER_UNEXPECTED_FIELD_BEHAVIOR: + case PROP_JSON_READ_OPTIONS_UNEXPECTED_FIELD_BEHAVIOR: g_value_set_enum(value, static_cast(priv->parse_options.unexpected_field_behavior)); break; - case PROP_JSON_READER_SCHEMA: + case PROP_JSON_READ_OPTIONS_SCHEMA: g_value_set_object(value, priv->schema); break; default: @@ -1769,7 +1804,7 @@ garrow_json_read_options_class_init(GArrowJSONReadOptionsClass *klass) read_options.use_threads, static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, - PROP_JSON_READER_USE_THREADS, + PROP_JSON_READ_OPTIONS_USE_THREADS, spec); /** @@ -1790,7 +1825,7 @@ garrow_json_read_options_class_init(GArrowJSONReadOptionsClass *klass) read_options.block_size, static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, - PROP_JSON_READER_BLOCK_SIZE, + PROP_JSON_READ_OPTIONS_BLOCK_SIZE, spec); @@ -1812,7 +1847,7 @@ garrow_json_read_options_class_init(GArrowJSONReadOptionsClass *klass) parse_options.newlines_in_values, static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, - PROP_JSON_READER_ALLOW_NEWLINES_IN_VALUES, + PROP_JSON_READ_OPTIONS_ALLOW_NEWLINES_IN_VALUES, spec); /** @@ -1829,7 +1864,7 @@ garrow_json_read_options_class_init(GArrowJSONReadOptionsClass *klass) GARROW_JSON_READ_INFER_TYPE, static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, - PROP_JSON_READER_UNEXPECTED_FIELD_BEHAVIOR, + 
PROP_JSON_READ_OPTIONS_UNEXPECTED_FIELD_BEHAVIOR, spec); /** @@ -1845,7 +1880,7 @@ garrow_json_read_options_class_init(GArrowJSONReadOptionsClass *klass) GARROW_TYPE_SCHEMA, static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, - PROP_JSON_READER_SCHEMA, + PROP_JSON_READ_OPTIONS_SCHEMA, spec); } @@ -1866,10 +1901,12 @@ garrow_json_read_options_new(void) typedef struct GArrowJSONReaderPrivate_ { std::shared_ptr reader; + GArrowInputStream *input; } GArrowJSONReaderPrivate; enum { - PROP_JSON_TABLE_READER = 1 + PROP_JSON_TABLE_READER = 1, + PROP_JSON_READER_INPUT, }; G_DEFINE_TYPE_WITH_PRIVATE(GArrowJSONReader, @@ -1886,11 +1923,24 @@ garrow_json_reader_dispose(GObject *object) { auto priv = GARROW_JSON_READER_GET_PRIVATE(object); - priv->reader = nullptr; + if (priv->input) { + g_object_unref(priv->input); + priv->input = nullptr; + } G_OBJECT_CLASS(garrow_json_reader_parent_class)->dispose(object); } +static void +garrow_json_reader_finalize(GObject *object) +{ + auto priv = GARROW_JSON_READER_GET_PRIVATE(object); + + priv->reader.~shared_ptr(); + + G_OBJECT_CLASS(garrow_json_reader_parent_class)->finalize(object); +} + static void garrow_json_reader_set_property(GObject *object, guint prop_id, @@ -1904,6 +1954,9 @@ garrow_json_reader_set_property(GObject *object, priv->reader = *static_cast *>(g_value_get_pointer(value)); break; + case PROP_JSON_READER_INPUT: + priv->input = GARROW_INPUT_STREAM(g_value_dup_object(value)); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -1916,7 +1969,12 @@ garrow_json_reader_get_property(GObject *object, GValue *value, GParamSpec *pspec) { + auto priv = GARROW_JSON_READER_GET_PRIVATE(object); + switch (prop_id) { + case PROP_JSON_READER_INPUT: + g_value_set_object(value, priv->input); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -1926,25 +1984,37 @@ garrow_json_reader_get_property(GObject *object, static void garrow_json_reader_init(GArrowJSONReader *object) { + auto priv = GARROW_JSON_READER_GET_PRIVATE(object); + new(&priv->reader) std::shared_ptr; } static void garrow_json_reader_class_init(GArrowJSONReaderClass *klass) { - GParamSpec *spec; - auto gobject_class = G_OBJECT_CLASS(klass); gobject_class->dispose = garrow_json_reader_dispose; + gobject_class->finalize = garrow_json_reader_finalize; gobject_class->set_property = garrow_json_reader_set_property; gobject_class->get_property = garrow_json_reader_get_property; + GParamSpec *spec; spec = g_param_spec_pointer("json-table-reader", "JSON table reader", "The raw std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_JSON_TABLE_READER, spec); + + spec = g_param_spec_object("input", + "Input", + "The input stream to be read", + GARROW_TYPE_INPUT_STREAM, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, + PROP_JSON_READER_INPUT, + spec); } /** @@ -1980,7 +2050,7 @@ garrow_json_reader_new(GArrowInputStream *input, } if (garrow::check(error, arrow_reader, "[json-reader][new]")) { - return garrow_json_reader_new_raw(&(arrow_reader.ValueOrDie())); + return garrow_json_reader_new_raw(&*arrow_reader, input); } else { return NULL; } @@ -2085,10 +2155,12 @@ garrow_feather_file_reader_get_raw(GArrowFeatherFileReader *reader) } GArrowCSVReader * -garrow_csv_reader_new_raw(std::shared_ptr *arrow_reader) +garrow_csv_reader_new_raw(std::shared_ptr *arrow_reader, + GArrowInputStream 
*input) { auto reader = GARROW_CSV_READER(g_object_new(GARROW_TYPE_CSV_READER, "csv-table-reader", arrow_reader, + "input", input, NULL)); return reader; } @@ -2101,10 +2173,12 @@ garrow_csv_reader_get_raw(GArrowCSVReader *reader) } GArrowJSONReader * -garrow_json_reader_new_raw(std::shared_ptr *arrow_reader) +garrow_json_reader_new_raw(std::shared_ptr *arrow_reader, + GArrowInputStream *input) { auto reader = GARROW_JSON_READER(g_object_new(GARROW_TYPE_JSON_READER, "json-table-reader", arrow_reader, + "input", input, NULL)); return reader; } diff --git a/c_glib/arrow-glib/reader.hpp b/c_glib/arrow-glib/reader.hpp index c1df700fe13..c7b2b76f215 100644 --- a/c_glib/arrow-glib/reader.hpp +++ b/c_glib/arrow-glib/reader.hpp @@ -44,11 +44,13 @@ std::shared_ptr garrow_feather_file_reader_get_raw(GArrowFeatherFileReader *reader); GArrowCSVReader * -garrow_csv_reader_new_raw(std::shared_ptr *arrow_reader); +garrow_csv_reader_new_raw(std::shared_ptr *arrow_reader, + GArrowInputStream *input); std::shared_ptr garrow_csv_reader_get_raw(GArrowCSVReader *reader); GArrowJSONReader * -garrow_json_reader_new_raw(std::shared_ptr *arrow_reader); +garrow_json_reader_new_raw(std::shared_ptr *arrow_reader, + GArrowInputStream *input); std::shared_ptr garrow_json_reader_get_raw(GArrowJSONReader *reader); diff --git a/c_glib/configure.ac b/c_glib/configure.ac deleted file mode 100644 index 58c75b45002..00000000000 --- a/c_glib/configure.ac +++ /dev/null @@ -1,346 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -AC_PREREQ(2.65) - -m4_define([arrow_glib_version], 4.0.0-SNAPSHOT) -AC_INIT([arrow-glib], - arrow_glib_version, - [https://issues.apache.org/jira/browse/ARROW], - [apache-arrow-glib]) -AC_CONFIG_AUX_DIR([config]) -AC_CONFIG_MACRO_DIR([m4]) - -AC_CONFIG_SRCDIR([arrow-glib/arrow-glib.h]) -AC_CONFIG_HEADERS([config.h]) - -AM_INIT_AUTOMAKE([1.13 foreign]) -AM_SILENT_RULES([yes]) - -GARROW_VERSION_MAJOR=$(echo "arrow_glib_version" | \ - sed -E -e 's/^([[0-9]]+)\..+$/\1/' | \ - tr -d '\n') -GARROW_VERSION_MINOR=$(echo "arrow_glib_version" | \ - sed -E -e 's/^[[0-9]]+\.([[0-9]]+)\..+$/\1/' | \ - tr -d '\n') -GARROW_VERSION_MICRO=$(echo "arrow_glib_version" | \ - sed -E -e 's/^[[0-9]]+\.[[0-9]]+\.([[0-9]]+).*$/\1/' | \ - tr -d '\n') -if echo "arrow_glib_version" | grep -- "-" > /dev/null; then - GARROW_VERSION_TAG=$(echo "arrow_glib_version" | \ - sed -E -e 's/^[[0-9]]+\.[[0-9]]+\.[[0-9]]+-(.+)$/\1/' | \ - tr -d '\n') -else - GARROW_VERSION_TAG= -fi -AC_SUBST(GARROW_VERSION_MAJOR) -AC_SUBST(GARROW_VERSION_MINOR) -AC_SUBST(GARROW_VERSION_MICRO) -AC_SUBST(GARROW_VERSION_TAG) - -GGANDIVA_VERSION_MAJOR=${GARROW_VERSION_MAJOR} -GGANDIVA_VERSION_MINOR=${GARROW_VERSION_MINOR} -GGANDIVA_VERSION_MICRO=${GARROW_VERSION_MICRO} -GGANDIVA_VERSION_TAG=${GARROW_VERSION_TAG} -AC_SUBST(GGANDIVA_VERSION_MAJOR) -AC_SUBST(GGANDIVA_VERSION_MINOR) -AC_SUBST(GGANDIVA_VERSION_MICRO) -AC_SUBST(GGANDIVA_VERSION_TAG) - -GPARQUET_VERSION_MAJOR=${GARROW_VERSION_MAJOR} -GPARQUET_VERSION_MINOR=${GARROW_VERSION_MINOR} -GPARQUET_VERSION_MICRO=${GARROW_VERSION_MICRO} -GPARQUET_VERSION_TAG=${GARROW_VERSION_TAG} -AC_SUBST(GPARQUET_VERSION_MAJOR) -AC_SUBST(GPARQUET_VERSION_MINOR) -AC_SUBST(GPARQUET_VERSION_MICRO) -AC_SUBST(GPARQUET_VERSION_TAG) - -AC_CANONICAL_HOST -AC_MSG_CHECKING([for macOS]) -case "$host_os" in -darwin*) - os_macos=yes - ;; -*) - os_macos=no - ;; -esac -AC_MSG_RESULT([$os_macos]) -AM_CONDITIONAL(OS_MACOS, test "$os_macos" = "yes") - -LT_INIT -LT_CURRENT=$(expr ${GARROW_VERSION_MAJOR} \* 100 + ${GARROW_VERSION_MINOR}) -LT_REVISION=${GARROW_VERSION_MICRO} -LT_AGE=0 -LT_VERSION_INFO="\$(LT_CURRENT):\$(LT_REVISION):\$(LT_AGE)" -AC_SUBST(LT_CURRENT) -AC_SUBST(LT_REVISION) -AC_SUBST(LT_AGE) -AC_SUBST(LT_VERSION_INFO) - -AC_PROG_CC -AC_PROG_CXX -AX_CXX_COMPILE_STDCXX_11([ext], [mandatory]) - -GARROW_CFLAGS="-Wall" -GARROW_CXXFLAGS="-Wall" -AC_ARG_ENABLE(debug, - [AS_HELP_STRING([--enable-debug], - [Use debug flags (default=no)])], - [GARROW_DEBUG="$enableval"], - [GARROW_DEBUG="no"]) -if test "x$GARROW_DEBUG" != "xno"; then - GARROW_DEBUG="yes" - if test "$CLANG" = "yes"; then - CFLAGS="$CFLAGS -O0 -g" - CXXFLAGS="$CXXFLAGS -O0 -g" - elif test "$GCC" = "yes"; then - CFLAGS="$CFLAGS -O0 -g3" - CXXFLAGS="$CXXFLAGS -O0 -g3" - fi -fi -AC_ARG_ENABLE(development-mode, - [AS_HELP_STRING([--enable-development-mode], - [Use development mode (default=no)])], - [GARROW_DEVELOPMENT_MODE="$enableval"], - [GARROW_DEVELOPMENT_MODE="no"]) -if test "x$GARROW_DEVELOPMENT_MODE" != "xno"; then - if test "$CLANG" = "yes" -o "$GCC" = "yes"; then - CFLAGS="$CFLAGS -Werror" - CXXFLAGS="$CXXFLAGS -Werror" - fi -fi -AC_SUBST(GARROW_CFLAGS) -AC_SUBST(GARROW_CXXFLAGS) - -AM_PATH_GLIB_2_0([2.32.4], - [], - [AC_MSG_ERROR(GLib isn't available)], - [gobject gio]) - -GOBJECT_INTROSPECTION_CHECK([1.32.1]) -GTK_DOC_CHECK([1.18-2]) - -AC_ARG_WITH(arrow-cpp-build-type, - [AS_HELP_STRING([--with-arrow-cpp-build-type=TYPE], - [-DCMAKE_BUILD_TYPE option value for Arrow C++ (default=release)])], - [GARROW_ARROW_CPP_BUILD_TYPE="$withval"], - 
[GARROW_ARROW_CPP_BUILD_TYPE="release"]) - -ARROW_CUDA_PKG_CONFIG_PATH="" -AC_ARG_WITH(arrow-cpp-build-dir, - [AS_HELP_STRING([--with-arrow-cpp-build-dir=PATH], - [Use this option to build with not installed Arrow C++])], - [GARROW_ARROW_CPP_BUILD_DIR="$withval"], - [GARROW_ARROW_CPP_BUILD_DIR=""]) -if test "x$GARROW_ARROW_CPP_BUILD_DIR" = "x"; then - USE_ARROW_BUILD_DIR=no - - arrow_packages="arrow" - arrow_packages="${arrow_packages} arrow-compute" - arrow_packages="${arrow_packages} arrow-csv" - arrow_packages="${arrow_packages} arrow-filesystem" - arrow_packages="${arrow_packages} arrow-json" - PKG_CHECK_MODULES([ARROW], [${arrow_packages}]) - _PKG_CONFIG(ARROW_LIB_DIR, [variable=libdir], [arrow]) - ARROW_LIB_DIR="$pkg_cv_ARROW_LIB_DIR" - PKG_CHECK_MODULES([ARROW_ORC], - [arrow-orc], - [HAVE_ARROW_ORC=yes], - [HAVE_ARROW_ORC=no]) - PKG_CHECK_MODULES([ARROW_CUDA], - [arrow-cuda], - [HAVE_ARROW_CUDA=yes], - [HAVE_ARROW_CUDA=no]) - PKG_CHECK_MODULES([ARROW_DATASET], - [arrow-dataset], - [HAVE_ARROW_DATASET=yes], - [HAVE_ARROW_DATASET=no]) - PKG_CHECK_MODULES([GANDIVA], - [gandiva], - [HAVE_GANDIVA=yes], - [HAVE_GANDIVA=no]) - PKG_CHECK_MODULES([PARQUET], - [parquet], - [HAVE_PARQUET=yes], - [HAVE_PARQUET=no]) - PKG_CHECK_MODULES([PLASMA], - [plasma], - [HAVE_PLASMA=yes], - [HAVE_PLASMA=no]) -else - USE_ARROW_BUILD_DIR=yes - - ARROW_BUILD_DIR="${GARROW_ARROW_CPP_BUILD_DIR}" - AC_SUBST(ARROW_BUILD_DIR) - - ARROW_SOURCE_INCLUDE_DIR="\$(abs_top_srcdir)/../cpp/src" - ARROW_BUILD_INCLUDE_DIR="${GARROW_ARROW_CPP_BUILD_DIR}/src" - ARROW_LIB_DIR="${GARROW_ARROW_CPP_BUILD_DIR}/${GARROW_ARROW_CPP_BUILD_TYPE}" - AC_SUBST(ARROW_LIB_DIR) - - ARROW_CFLAGS="-I${ARROW_BUILD_INCLUDE_DIR} -I${ARROW_SOURCE_INCLUDE_DIR}" - ARROW_LIBS="-L\$(ARROW_LIB_DIR) -larrow" - AC_SUBST(ARROW_CFLAGS) - AC_SUBST(ARROW_LIBS) - - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/arrow/adapters/orc/arrow-orc.pc"; then - HAVE_ARROW_ORC=yes - else - HAVE_ARROW_ORC=no - fi - - ARROW_CUDA_CFLAGS="\$(ARROW_CFLAGS)" - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/arrow/gpu/arrow-cuda.pc"; then - HAVE_ARROW_CUDA=yes - ARROW_CUDA_LIBS="-L\$(ARROW_LIB_DIR) -larrow_cuda -larrow" - ARROW_CUDA_PKG_CONFIG_PATH="\$(ARROW_BUILD_DIR)/src/arrow/gpu" - else - HAVE_ARROW_CUDA=no - ARROW_CUDA_LIBS="" - ARROW_CUDA_PKG_CONFIG_PATH="" - fi - AC_SUBST(ARROW_CUDA_CFLAGS) - AC_SUBST(ARROW_CUDA_LIBS) - AC_SUBST(ARROW_CUDA_PKG_CONFIG_PATH) - - ARROW_DATASET_CFLAGS="\$(ARROW_CFLAGS)" - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/arrow/dataset/arrow-dataset.pc"; then - HAVE_ARROW_DATASET=yes - ARROW_DATASET_LIBS="-L\$(ARROW_LIB_DIR) -larrow_dataset -lparquet -larrow" - else - HAVE_ARROW_DATASET=no - ARROW_DATASET_LIBS="" - fi - AC_SUBST(ARROW_DATASET_CFLAGS) - AC_SUBST(ARROW_DATASET_LIBS) - - GANDIVA_CFLAGS="\$(ARROW_CFLAGS)" - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/gandiva/gandiva.pc"; then - HAVE_GANDIVA=yes - GANDIVA_LIBS="-L\$(ARROW_LIB_DIR) -lgandiva -larrow" - else - HAVE_GANDIVA=no - GANDIVA_LIBS="" - fi - AC_SUBST(GANDIVA_CFLAGS) - AC_SUBST(GANDIVA_LIBS) - - PARQUET_CFLAGS="\$(ARROW_CFLAGS)" - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/parquet/parquet.pc"; then - HAVE_PARQUET=yes - PARQUET_LIBS="-L\$(ARROW_LIB_DIR) -lparquet -larrow" - else - HAVE_PARQUET=no - PARQUET_LIBS="" - fi - AC_SUBST(PARQUET_CFLAGS) - AC_SUBST(PARQUET_LIBS) - - PLASMA_CFLAGS="\$(ARROW_CFLAGS)" - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/plasma/plasma.pc"; then - HAVE_PLASMA=yes - PLASMA_LIBS="-L\$(ARROW_LIB_DIR) -lplasma -larrow" - else - HAVE_PLASMA=no - 
PLASMA_LIBS="" - fi - AC_SUBST(PLASMA_CFLAGS) - AC_SUBST(PLASMA_LIBS) -fi - -AM_CONDITIONAL([USE_ARROW_BUILD_DIR], - [test "$USE_ARROW_BUILD_DIR" = "yes"]) - -AM_CONDITIONAL([HAVE_ARROW_ORC], [test "$HAVE_ARROW_ORC" = "yes"]) -if test "$HAVE_ARROW_ORC" = "yes"; then - AC_DEFINE(HAVE_ARROW_ORC, [1], [Define to 1 if Apache Arrow supports ORC.]) -fi - -AM_CONDITIONAL([HAVE_ARROW_CUDA], [test "$HAVE_ARROW_CUDA" = "yes"]) -if test "$HAVE_ARROW_CUDA" = "yes"; then - ARROW_CUDA_GLIB_PACKAGE="arrow-cuda-glib" - PLASMA_ARROW_CUDA_PKG_CONFIG_PATH=":\$(abs_top_builddir)/arrow-cuda-glib" - if test -n "${ARROW_CUDA_PKG_CONFIG_PATH}"; then - PLASMA_ARROW_CUDA_PKG_CONFIG_PATH=":${ARROW_CUDA_PKG_CONFIG_PATH}${PLASMA_ARROW_CUDA_PKG_CONFIG_PATH}" - fi - AC_DEFINE(HAVE_ARROW_CUDA, [1], [Define to 1 if Apache Arrow supports CUDA.]) -else - ARROW_CUDA_GLIB_PACKAGE="" - PLASMA_ARROW_CUDA_PKG_CONFIG_PATH="" -fi -AC_SUBST(ARROW_CUDA_GLIB_PACKAGE) -AC_SUBST(PLASMA_ARROW_CUDA_PKG_CONFIG_PATH) - -AM_CONDITIONAL([HAVE_ARROW_DATASET], [test "$HAVE_ARROW_DATASET" = "yes"]) -if test "$HAVE_ARROW_DATASET" = "yes"; then - AC_DEFINE(HAVE_ARROW_DATASET, [1], [Define to 1 if Apache Arrow Dataset exists.]) -fi - -AM_CONDITIONAL([HAVE_GANDIVA], [test "$HAVE_GANDIVA" = "yes"]) -if test "$HAVE_GANDIVA" = "yes"; then - AC_DEFINE(HAVE_GANDIVA, [1], [Define to 1 if Gandiva exists.]) -fi - -AM_CONDITIONAL([HAVE_PARQUET], [test "$HAVE_PARQUET" = "yes"]) -if test "$HAVE_PARQUET" = "yes"; then - AC_DEFINE(HAVE_PARQUET, [1], [Define to 1 if Apache Parquet exists.]) -fi - -AM_CONDITIONAL([HAVE_PLASMA], [test "$HAVE_PLASMA" = "yes"]) -if test "$HAVE_PLASMA" = "yes"; then - AC_DEFINE(HAVE_PLASMA, [1], [Define to 1 if Plasma exists.]) -fi - -exampledir="\$(datadir)/arrow-glib/example" -AC_SUBST(exampledir) - -AC_CONFIG_FILES([ - Makefile - arrow-cuda-glib/Makefile - arrow-cuda-glib/arrow-cuda-glib.pc - arrow-dataset-glib/Makefile - arrow-dataset-glib/arrow-dataset-glib.pc - arrow-glib/Makefile - arrow-glib/arrow-glib.pc - arrow-glib/arrow-orc-glib.pc - arrow-glib/version.h - gandiva-glib/Makefile - gandiva-glib/gandiva-glib.pc - gandiva-glib/version.h - parquet-glib/Makefile - parquet-glib/parquet-glib.pc - parquet-glib/version.h - plasma-glib/Makefile - plasma-glib/plasma-glib.pc - doc/Makefile - doc/arrow-dataset-glib/Makefile - doc/arrow-dataset-glib/entities.xml - doc/arrow-glib/Makefile - doc/arrow-glib/entities.xml - doc/gandiva-glib/Makefile - doc/gandiva-glib/entities.xml - doc/parquet-glib/Makefile - doc/parquet-glib/entities.xml - doc/plasma-glib/Makefile - doc/plasma-glib/entities.xml - example/Makefile - example/lua/Makefile -]) - -AC_OUTPUT diff --git a/c_glib/doc/Makefile.am b/c_glib/doc/Makefile.am deleted file mode 100644 index a56e0415f3d..00000000000 --- a/c_glib/doc/Makefile.am +++ /dev/null @@ -1,23 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -SUBDIRS = \ - arrow-glib \ - arrow-dataset-glib \ - gandiva-glib \ - parquet-glib \ - plasma-glib diff --git a/c_glib/doc/arrow-dataset-glib/Makefile.am b/c_glib/doc/arrow-dataset-glib/Makefile.am deleted file mode 100644 index d1c636143ff..00000000000 --- a/c_glib/doc/arrow-dataset-glib/Makefile.am +++ /dev/null @@ -1,69 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -if HAVE_ARROW_DATASET -DOC_MODULE = arrow-dataset-glib - -DOC_MAIN_SGML_FILE = $(DOC_MODULE)-docs.xml - -DOC_SOURCE_DIR = \ - $(top_srcdir)/arrow-dataset-glib \ - $(top_builddir)/arrow-dataset-glib - -SCAN_OPTIONS = \ - --deprecated-guards="GARROW_DISABLE_DEPRECATED" - -MKDB_OPTIONS = \ - --name-space=gad \ - --source-suffixes="c,cpp,h" - -HFILE_GLOB = \ - $(top_srcdir)/arrow-dataset-glib/*.h - -IGNORE_HFILES = - -CFILE_GLOB = \ - $(top_srcdir)/arrow-dataset-glib/*.cpp - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CFLAGS) \ - $(ARROW_DATASET_CFLAGS) - -GTKDOC_LIBS = \ - $(top_builddir)/arrow-glib/libarrow-glib.la \ - $(top_builddir)/arrow-dataset-glib/libarrow-dataset-glib.la - -include $(top_srcdir)/gtk-doc.make - -CLEANFILES += \ - $(DOC_MODULE)-decl-list.txt \ - $(DOC_MODULE)-decl.txt \ - $(DOC_MODULE)-overrides.txt \ - $(DOC_MODULE)-sections.txt \ - $(DOC_MODULE).types -else -EXTRA_DIST = -endif - -EXTRA_DIST += \ - entities.xml.in \ - meson.build diff --git a/c_glib/doc/arrow-glib/Makefile.am b/c_glib/doc/arrow-glib/Makefile.am deleted file mode 100644 index db9f00f39f3..00000000000 --- a/c_glib/doc/arrow-glib/Makefile.am +++ /dev/null @@ -1,80 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -DOC_MODULE = arrow-glib - -DOC_MAIN_SGML_FILE = $(DOC_MODULE)-docs.xml - -DOC_SOURCE_DIR = \ - $(top_srcdir)/arrow-glib \ - $(top_builddir)/arrow-glib - -SCAN_OPTIONS = \ - --deprecated-guards="GARROW_DISABLE_DEPRECATED" - -MKDB_OPTIONS = \ - --name-space=garrow \ - --source-suffixes="c,cpp,h" - -HFILE_GLOB = \ - $(top_srcdir)/arrow-glib/*.h \ - $(top_builddir)/arrow-glib/*.h - -IGNORE_HFILES = - -if !HAVE_ARROW_ORC -IGNORE_HFILES += \ - $(top_srcdir)/arrow-glib/orc-file-reader.h -endif - -CFILE_GLOB = \ - $(top_srcdir)/arrow-glib/*.cpp - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CFLAGS) - -GTKDOC_LIBS = \ - $(top_builddir)/arrow-glib/libarrow-glib.la - -if HAVE_ARROW_CUDA -DOC_SOURCE_DIR += \ - $(top_srcdir)/arrow-cuda-glib -HFILE_GLOB += \ - $(top_srcdir)/arrow-cuda-glib/*.h -CFILE_GLOB += \ - $(top_srcdir)/arrow-cuda-glib/*.cpp -GTKDOC_LIBS += \ - $(top_builddir)/arrow-cuda-glib/libarrow-cuda-glib.la -endif - -include $(top_srcdir)/gtk-doc.make - -CLEANFILES += \ - $(DOC_MODULE)-decl-list.txt \ - $(DOC_MODULE)-decl.txt \ - $(DOC_MODULE)-overrides.txt \ - $(DOC_MODULE)-sections.txt \ - $(DOC_MODULE).types - -EXTRA_DIST += \ - entities.xml.in \ - meson.build diff --git a/c_glib/doc/gandiva-glib/Makefile.am b/c_glib/doc/gandiva-glib/Makefile.am deleted file mode 100644 index 16d333d0ae3..00000000000 --- a/c_glib/doc/gandiva-glib/Makefile.am +++ /dev/null @@ -1,69 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -if HAVE_GANDIVA -DOC_MODULE = gandiva-glib - -DOC_MAIN_SGML_FILE = $(DOC_MODULE)-docs.xml - -DOC_SOURCE_DIR = \ - $(top_srcdir)/gandiva-glib \ - $(top_builddir)/gandiva-glib - -SCAN_OPTIONS = \ - --deprecated-guards="GGANDIVA_DISABLE_DEPRECATED" - -MKDB_OPTIONS = \ - --name-space=ggandiva \ - --source-suffixes="c,cpp,h" - -HFILE_GLOB = \ - $(top_srcdir)/gandiva-glib/*.h - -IGNORE_HFILES = - -CFILE_GLOB = \ - $(top_srcdir)/gandiva-glib/*.cpp - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CFLAGS) \ - $(GANDIVA_CFLAGS) - -GTKDOC_LIBS = \ - $(top_builddir)/arrow-glib/libarrow-glib.la \ - $(top_builddir)/gandiva-glib/libgandiva-glib.la - -include $(top_srcdir)/gtk-doc.make - -CLEANFILES += \ - $(DOC_MODULE)-decl-list.txt \ - $(DOC_MODULE)-decl.txt \ - $(DOC_MODULE)-overrides.txt \ - $(DOC_MODULE)-sections.txt \ - $(DOC_MODULE).types -else -EXTRA_DIST = -endif - -EXTRA_DIST += \ - entities.xml.in \ - meson.build diff --git a/c_glib/doc/gandiva-glib/gandiva-glib-docs.xml b/c_glib/doc/gandiva-glib/gandiva-glib-docs.xml index c90f53780aa..182bbfb527e 100644 --- a/c_glib/doc/gandiva-glib/gandiva-glib-docs.xml +++ b/c_glib/doc/gandiva-glib/gandiva-glib-docs.xml @@ -42,6 +42,14 @@ Expression + + Filter + + + + Selection vector + + Projector @@ -92,6 +100,10 @@ Index of deprecated API + + Index of new symbols in 4.0.0 + + Index of new symbols in 1.0.0 diff --git a/c_glib/doc/parquet-glib/Makefile.am b/c_glib/doc/parquet-glib/Makefile.am deleted file mode 100644 index d125be1b54c..00000000000 --- a/c_glib/doc/parquet-glib/Makefile.am +++ /dev/null @@ -1,69 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -if HAVE_PARQUET -DOC_MODULE = parquet-glib - -DOC_MAIN_SGML_FILE = $(DOC_MODULE)-docs.xml - -DOC_SOURCE_DIR = \ - $(top_srcdir)/parquet-glib \ - $(top_builddir)/parquet-glib - -SCAN_OPTIONS = \ - --deprecated-guards="GPARQUET_DISABLE_DEPRECATED" - -MKDB_OPTIONS = \ - --name-space=gparquet \ - --source-suffixes="c,cpp,h" - -HFILE_GLOB = \ - $(top_srcdir)/parquet-glib/*.h - -IGNORE_HFILES = - -CFILE_GLOB = \ - $(top_srcdir)/parquet-glib/*.cpp - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CFLAGS) \ - $(PARQUET_CFLAGS) - -GTKDOC_LIBS = \ - $(top_builddir)/parquet-glib/libparquet-glib.la \ - $(top_builddir)/arrow-glib/libarrow-glib.la - -include $(top_srcdir)/gtk-doc.make - -CLEANFILES += \ - $(DOC_MODULE)-decl-list.txt \ - $(DOC_MODULE)-decl.txt \ - $(DOC_MODULE)-overrides.txt \ - $(DOC_MODULE)-sections.txt \ - $(DOC_MODULE).types -else -EXTRA_DIST = -endif - -EXTRA_DIST += \ - entities.xml.in \ - meson.build diff --git a/c_glib/doc/plasma-glib/Makefile.am b/c_glib/doc/plasma-glib/Makefile.am deleted file mode 100644 index df872d6ca31..00000000000 --- a/c_glib/doc/plasma-glib/Makefile.am +++ /dev/null @@ -1,76 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -PLASMA_ARROW_CUDA_GTKDOC_LIBS = -if HAVE_ARROW_CUDA -PLASMA_ARROW_CUDA_GTKDOC_LIBS += \ - $(top_builddir)/arrow-cuda-glib/libarrow-cuda-glib.la -endif - -if HAVE_PLASMA -DOC_MODULE = plasma-glib - -DOC_MAIN_SGML_FILE = $(DOC_MODULE)-docs.xml - -DOC_SOURCE_DIR = \ - $(top_srcdir)/plasma-glib \ - $(top_builddir)/plasma-glib - -SCAN_OPTIONS = \ - --deprecated-guards="GPLASMA_DISABLE_DEPRECATED" - -MKDB_OPTIONS = \ - --name-space=gplasma \ - --source-suffixes="c,cpp,h" - -HFILE_GLOB = \ - $(top_srcdir)/plasma-glib/*.h - -IGNORE_HFILES = - -CFILE_GLOB = \ - $(top_srcdir)/plasma-glib/*.cpp - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CFLAGS) \ - $(PLASMA_CFLAGS) - -GTKDOC_LIBS = \ - $(top_builddir)/arrow-glib/libarrow-glib.la \ - $(PLASMA_ARROW_CUDA_GTKDOC_LIBS) \ - $(top_builddir)/plasma-glib/libplasma-glib.la - -include $(top_srcdir)/gtk-doc.make - -CLEANFILES += \ - $(DOC_MODULE)-decl-list.txt \ - $(DOC_MODULE)-decl.txt \ - $(DOC_MODULE)-overrides.txt \ - $(DOC_MODULE)-sections.txt \ - $(DOC_MODULE).types -else -EXTRA_DIST = -endif - -EXTRA_DIST += \ - entities.xml.in \ - meson.build diff --git a/c_glib/example/Makefile.am b/c_glib/example/Makefile.am deleted file mode 100644 index 9e460ecf8e0..00000000000 --- a/c_glib/example/Makefile.am +++ /dev/null @@ -1,64 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -SUBDIRS = \ - lua - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) \ - -DGARROW_DISABLE_DEPRECATED - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(GARROW_CFLAGS) - -AM_LDFLAGS = \ - $(GLIB_LIBS) \ - $(builddir)/../arrow-glib/libarrow-glib.la -if USE_ARROW_BUILD_DIR -AM_LDFLAGS += \ - $(ARROW_LIBS) -endif - -noinst_PROGRAMS = \ - build \ - extension-type \ - read-batch \ - read-stream - -build_SOURCES = \ - build.c - -extension_type_SOURCES = \ - extension-type.c - -read_batch_SOURCES = \ - read-batch.c - -read_stream_SOURCES = \ - read-stream.c - -dist_example_DATA = \ - README.md \ - $(build_SOURCES) \ - $(extension_type_SOURCES) \ - $(read_batch_SOURCES) \ - $(read_stream_SOURCES) diff --git a/c_glib/example/lua/Makefile.am b/c_glib/example/lua/Makefile.am deleted file mode 100644 index 84ddbc7607b..00000000000 --- a/c_glib/example/lua/Makefile.am +++ /dev/null @@ -1,27 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -EXTRA_DIST = \ - meson.build - -lua_exampledir = $(exampledir)/lua -dist_lua_example_DATA = \ - README.md \ - read-batch.lua \ - read-stream.lua \ - write-batch.lua \ - write-stream.lua diff --git a/c_glib/gandiva-glib/Makefile.am b/c_glib/gandiva-glib/Makefile.am deleted file mode 100644 index 5991abeab3a..00000000000 --- a/c_glib/gandiva-glib/Makefile.am +++ /dev/null @@ -1,196 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -CLEANFILES = -DISTCLEANFILES = - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(GARROW_CFLAGS) \ - $(GGANDIVA_CFLAGS) - -if HAVE_GANDIVA -lib_LTLIBRARIES = \ - libgandiva-glib.la - -libgandiva_glib_la_CXXFLAGS = \ - $(GLIB_CFLAGS) \ - $(GANDIVA_CFLAGS) \ - $(GARROW_CFLAGS) \ - $(GGANDIVA_CFLAGS) - -libgandiva_glib_la_LDFLAGS = \ - -version-info $(LT_VERSION_INFO) \ - -no-undefined - -libgandiva_glib_la_LIBADD = \ - $(GLIB_LIBS) \ - ../arrow-glib/libarrow-glib.la \ - $(GANDIVA_LIBS) - -libgandiva_glib_la_headers = \ - expression.h \ - function-registry.h \ - function-signature.h \ - gandiva-glib.h \ - native-function.h \ - node.h \ - projector.h - -libgandiva_glib_la_generated_headers = \ - enums.h \ - version.h - -libgandiva_glib_la_generated_sources = \ - enums.c \ - $(libgandiva_glib_la_generated_headers) - -libgandiva_glib_la_sources = \ - expression.cpp \ - function-registry.cpp \ - function-signature.cpp \ - node.cpp \ - native-function.cpp \ - projector.cpp \ - $(libgandiva_glib_la_headers) \ - $(libgandiva_glib_la_generated_sources) - -libgandiva_glib_la_cpp_headers = \ - expression.hpp \ - function-signature.hpp \ - gandiva-glib.hpp \ - native-function.hpp \ - node.hpp \ - projector.hpp - -libgandiva_glib_la_SOURCES = \ - $(libgandiva_glib_la_sources) \ - $(libgandiva_glib_la_cpp_headers) - -BUILT_SOURCES = \ - $(libgandiva_glib_la_generated_sources) \ - stamp-enums.c \ - stamp-enums.h - -DISTCLEANFILES += \ - stamp-enums.c \ - stamp-enums.h - -EXTRA_DIST += \ - enums.c.template \ - enums.h.template - -enums.h: stamp-enums.h - @true -stamp-enums.h: $(libgandiva_glib_la_headers) enums.h.template - $(AM_V_GEN) \ - (cd $(srcdir) && \ - $(GLIB_MKENUMS) \ - --identifier-prefix GGandiva \ - --symbol-prefix ggandiva \ - --template enums.h.template \ - $(libgandiva_glib_la_headers)) > enums.h - touch $@ - -enums.c: stamp-enums.c - @true -stamp-enums.c: $(libarrow_glib_la_headers) enums.c.template - $(AM_V_GEN) \ - (cd $(srcdir) && \ - $(GLIB_MKENUMS) \ - --identifier-prefix GGandiva \ - --symbol-prefix ggandiva \ - --template enums.c.template \ - $(libgandiva_glib_la_headers)) > enums.c - touch $@ - -gandiva_glib_includedir = $(includedir)/gandiva-glib -gandiva_glib_include_HEADERS = \ - $(libgandiva_glib_la_headers) \ - $(libgandiva_glib_la_cpp_headers) \ - $(libgandiva_glib_la_generated_headers) - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = \ - gandiva-glib.pc - -# GObject Introspection -if HAVE_INTROSPECTION --include $(INTROSPECTION_MAKEFILE) -INTROSPECTION_GIRS = -INTROSPECTION_SCANNER_ARGS = -INTROSPECTION_SCANNER_ENV = -if USE_ARROW_BUILD_DIR -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_top_builddir}/arrow-glib:$(ARROW_BUILD_DIR)/src/arrow:$${PKG_CONFIG_PATH} -else -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_top_builddir}/arrow-glib:$${PKG_CONFIG_PATH} -endif -INTROSPECTION_COMPILER_ARGS = \ - --includedir=$(abs_top_builddir)/arrow-glib - -Gandiva-1.0.gir: libgandiva-glib.la -Gandiva_1_0_gir_PACKAGES = \ - arrow-glib -Gandiva_1_0_gir_EXPORT_PACKAGES = \ - gandiva-glib -Gandiva_1_0_gir_INCLUDES = \ - Arrow-1.0 -Gandiva_1_0_gir_CFLAGS = \ - $(AM_CPPFLAGS) -Gandiva_1_0_gir_LIBS = -Gandiva_1_0_gir_FILES = $(libgandiva_glib_la_sources) -Gandiva_1_0_gir_SCANNERFLAGS = \ - --add-include-path=$(abs_top_builddir)/arrow-glib \ - --library-path=$(ARROW_LIB_DIR) \ - --warn-all \ - --identifier-prefix=GGandiva \ - --symbol-prefix=ggandiva -if OS_MACOS -Gandiva_1_0_gir_LIBS += \ - 
arrow-glib \ - gandiva-glib -Gandiva_1_0_gir_SCANNERFLAGS += \ - --no-libtool \ - --library-path=$(abs_top_builddir)/arrow-glib/.libs \ - --library-path=$(abs_builddir)/.libs -else -Gandiva_1_0_gir_LIBS += \ - $(abs_top_builddir)/arrow-glib/libarrow-glib.la \ - libgandiva-glib.la -endif -INTROSPECTION_GIRS += Gandiva-1.0.gir - -girdir = $(datadir)/gir-1.0 -gir_DATA = $(INTROSPECTION_GIRS) - -typelibdir = $(libdir)/girepository-1.0 -typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) - -CLEANFILES += \ - $(gir_DATA) \ - $(typelib_DATA) -endif -endif diff --git a/c_glib/gandiva-glib/expression.cpp b/c_glib/gandiva-glib/expression.cpp index d3c2f58dfdb..2ad98bfc007 100644 --- a/c_glib/gandiva-glib/expression.cpp +++ b/c_glib/gandiva-glib/expression.cpp @@ -36,6 +36,8 @@ G_BEGIN_DECLS * #GGandivaExpression is a class for an expression tree with a root node, * and a result field. * + * #GGandivaCondition is a class for an expression that returns boolean. + * * Since: 0.12.0 */ @@ -217,6 +219,40 @@ ggandiva_expression_to_string(GGandivaExpression *expression) return g_strndup(string.data(), string.size()); } + +G_DEFINE_TYPE(GGandivaCondition, + ggandiva_condition, + GGANDIVA_TYPE_EXPRESSION) + +static void +ggandiva_condition_init(GGandivaCondition *object) +{ +} + +static void +ggandiva_condition_class_init(GGandivaConditionClass *klass) +{ +} + +/** + * ggandiva_condition_new: + * @root_node: The root node for the condition. + * + * Returns: A newly created #GGandivaCondition. + * + * Since: 4.0.0 + */ +GGandivaCondition * +ggandiva_condition_new(GGandivaNode *root_node) +{ + auto gandiva_root_node = ggandiva_node_get_raw(root_node); + auto gandiva_condition = + gandiva::TreeExprBuilder::MakeCondition(gandiva_root_node); + return ggandiva_condition_new_raw(&gandiva_condition, + root_node); +} + + G_END_DECLS GGandivaExpression * @@ -238,3 +274,25 @@ ggandiva_expression_get_raw(GGandivaExpression *expression) auto priv = GGANDIVA_EXPRESSION_GET_PRIVATE(expression); return priv->expression; } + + +GGandivaCondition * +ggandiva_condition_new_raw(std::shared_ptr *gandiva_condition, + GGandivaNode *root_node) +{ + auto arrow_result_field = (*gandiva_condition)->result(); + auto result_field = garrow_field_new_raw(&arrow_result_field, nullptr); + auto condition = g_object_new(GGANDIVA_TYPE_CONDITION, + "expression", gandiva_condition, + "root-node", root_node, + "result-field", result_field, + NULL); + return GGANDIVA_CONDITION(condition); +} + +std::shared_ptr +ggandiva_condition_get_raw(GGandivaCondition *condition) +{ + return std::static_pointer_cast( + ggandiva_expression_get_raw(GGANDIVA_EXPRESSION(condition))); +} diff --git a/c_glib/gandiva-glib/expression.h b/c_glib/gandiva-glib/expression.h index f86b6c504c2..0a720d9afbd 100644 --- a/c_glib/gandiva-glib/expression.h +++ b/c_glib/gandiva-glib/expression.h @@ -37,8 +37,27 @@ struct _GGandivaExpressionClass GObjectClass parent_class; }; -GGandivaExpression *ggandiva_expression_new(GGandivaNode *root_node, - GArrowField *result_field); +GGandivaExpression * +ggandiva_expression_new(GGandivaNode *root_node, + GArrowField *result_field); gchar *ggandiva_expression_to_string(GGandivaExpression *expression); + +#define GGANDIVA_TYPE_CONDITION (ggandiva_condition_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaCondition, + ggandiva_condition, + GGANDIVA, + CONDITION, + GGandivaExpression) + +struct _GGandivaConditionClass +{ + GGandivaExpressionClass parent_class; +}; + +GGANDIVA_AVAILABLE_IN_4_0 +GGandivaCondition * 
+ggandiva_condition_new(GGandivaNode *root_node); + + G_END_DECLS diff --git a/c_glib/gandiva-glib/expression.hpp b/c_glib/gandiva-glib/expression.hpp index a0d0e64c076..45b6593937f 100644 --- a/c_glib/gandiva-glib/expression.hpp +++ b/c_glib/gandiva-glib/expression.hpp @@ -26,8 +26,14 @@ #include -GGandivaExpression -*ggandiva_expression_new_raw(std::shared_ptr *gandiva_expression, - GGandivaNode *root_node, - GArrowField *result_field); +GGandivaExpression * +ggandiva_expression_new_raw(std::shared_ptr *gandiva_expression, + GGandivaNode *root_node, + GArrowField *result_field); std::shared_ptr ggandiva_expression_get_raw(GGandivaExpression *expression); + +GGandivaCondition +*ggandiva_condition_new_raw(std::shared_ptr *gandiva_expression, + GGandivaNode *root_node); +std::shared_ptr +ggandiva_condition_get_raw(GGandivaCondition *condition); diff --git a/c_glib/gandiva-glib/filter.cpp b/c_glib/gandiva-glib/filter.cpp new file mode 100644 index 00000000000..34e04fcd30c --- /dev/null +++ b/c_glib/gandiva-glib/filter.cpp @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + +#include +#include +#include +#include + +#include +#include +#include + +G_BEGIN_DECLS + +/** + * SECTION: filter + * @title: Filter classes + * @include: gandiva-glib/gandiva-glib.h + * + * #GGandivaFilter is a class for selecting records by a specific + * condition. 
+ * + * Since: 4.0.0 + */ + +typedef struct GGandivaFilterPrivate_ { + std::shared_ptr filter; + GArrowSchema *schema; + GGandivaCondition *condition; +} GGandivaFilterPrivate; + +enum { + PROP_FILTER = 1, + PROP_SCHEMA, + PROP_CONDITION, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GGandivaFilter, + ggandiva_filter, + G_TYPE_OBJECT) + +#define GGANDIVA_FILTER_GET_PRIVATE(obj) \ + static_cast( \ + ggandiva_filter_get_instance_private( \ + GGANDIVA_FILTER(obj))) + +static void +ggandiva_filter_dispose(GObject *object) +{ + auto priv = GGANDIVA_FILTER_GET_PRIVATE(object); + + if (priv->schema) { + g_object_unref(priv->schema); + priv->schema = nullptr; + } + + if (priv->condition) { + g_object_unref(priv->condition); + priv->condition = nullptr; + } + + G_OBJECT_CLASS(ggandiva_filter_parent_class)->dispose(object); +} + +static void +ggandiva_filter_finalize(GObject *object) +{ + auto priv = GGANDIVA_FILTER_GET_PRIVATE(object); + + priv->filter.~shared_ptr(); + + G_OBJECT_CLASS(ggandiva_filter_parent_class)->finalize(object); +} + +static void +ggandiva_filter_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GGANDIVA_FILTER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FILTER: + priv->filter = + *static_cast *>(g_value_get_pointer(value)); + break; + case PROP_SCHEMA: + priv->schema = GARROW_SCHEMA(g_value_dup_object(value)); + break; + case PROP_CONDITION: + priv->condition = GGANDIVA_CONDITION(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +ggandiva_filter_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GGANDIVA_FILTER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_SCHEMA: + g_value_set_object(value, priv->schema); + break; + case PROP_CONDITION: + g_value_set_object(value, priv->condition); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +ggandiva_filter_init(GGandivaFilter *object) +{ + auto priv = GGANDIVA_FILTER_GET_PRIVATE(object); + new(&priv->filter) std::shared_ptr; +} + +static void +ggandiva_filter_class_init(GGandivaFilterClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = ggandiva_filter_dispose; + gobject_class->finalize = ggandiva_filter_finalize; + gobject_class->set_property = ggandiva_filter_set_property; + gobject_class->get_property = ggandiva_filter_get_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("filter", + "Filter", + "The raw std::shared *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FILTER, spec); + + spec = g_param_spec_object("schema", + "Schema", + "The schema for input record batch", + GARROW_TYPE_SCHEMA, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_SCHEMA, spec); + + spec = g_param_spec_object("condition", + "Condition", + "The condition for the filter", + GGANDIVA_TYPE_CONDITION, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_CONDITION, spec); +} + +/** + * ggandiva_filter_new: + * @schema: A #GArrowSchema. + * @condition: The condition to be used. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GGandivaFilter on success, + * %NULL on error. 
+ * + * Since: 4.0.0 + */ +GGandivaFilter * +ggandiva_filter_new(GArrowSchema *schema, + GGandivaCondition *condition, + GError **error) +{ + auto arrow_schema = garrow_schema_get_raw(schema); + auto gandiva_condition = ggandiva_condition_get_raw(condition); + std::shared_ptr gandiva_filter; + auto status = gandiva::Filter::Make(arrow_schema, + gandiva_condition, + &gandiva_filter); + if (garrow_error_check(error, status, "[gandiva][filter][new]")) { + return ggandiva_filter_new_raw(&gandiva_filter, schema, condition); + } else { + return NULL; + } +} + +/** + * ggandiva_filter_evaluate: + * @filter: A #GGandivaFilter. + * @record_batch: A #GArrowRecordBatch. + * @selection_vector: A #GGandivaSelectionVector that is used as + * output. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE otherwise. + * + * Since: 4.0.0 + */ +gboolean +ggandiva_filter_evaluate(GGandivaFilter *filter, + GArrowRecordBatch *record_batch, + GGandivaSelectionVector *selection_vector, + GError **error) +{ + auto gandiva_filter = ggandiva_filter_get_raw(filter); + auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); + auto gandiva_selection_vector = + ggandiva_selection_vector_get_raw(selection_vector); + auto status = gandiva_filter->Evaluate(*arrow_record_batch, + gandiva_selection_vector); + return garrow_error_check(error, status, "[gandiva][filter][evaluate]"); +} + +G_END_DECLS + +GGandivaFilter * +ggandiva_filter_new_raw(std::shared_ptr *gandiva_filter, + GArrowSchema *schema, + GGandivaCondition *condition) +{ + auto filter = g_object_new(GGANDIVA_TYPE_FILTER, + "filter", gandiva_filter, + "schema", schema, + "condition", condition, + NULL); + return GGANDIVA_FILTER(filter); +} + +std::shared_ptr +ggandiva_filter_get_raw(GGandivaFilter *filter) +{ + auto priv = GGANDIVA_FILTER_GET_PRIVATE(filter); + return priv->filter; +} diff --git a/c_glib/gandiva-glib/filter.h b/c_glib/gandiva-glib/filter.h new file mode 100644 index 00000000000..9a0a5dc5d85 --- /dev/null +++ b/c_glib/gandiva-glib/filter.h @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include +#include + +G_BEGIN_DECLS + +#define GGANDIVA_TYPE_FILTER (ggandiva_filter_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaFilter, + ggandiva_filter, + GGANDIVA, + FILTER, + GObject) + +struct _GGandivaFilterClass +{ + GObjectClass parent_class; +}; + +GGandivaFilter * +ggandiva_filter_new(GArrowSchema *schema, + GGandivaCondition *condition, + GError **error); +gboolean +ggandiva_filter_evaluate(GGandivaFilter *filter, + GArrowRecordBatch *record_batch, + GGandivaSelectionVector *selection_vector, + GError **error); + +G_END_DECLS diff --git a/c_glib/gandiva-glib/filter.hpp b/c_glib/gandiva-glib/filter.hpp new file mode 100644 index 00000000000..a0bee9120a7 --- /dev/null +++ b/c_glib/gandiva-glib/filter.hpp @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include + +#include + +GGandivaFilter * +ggandiva_filter_new_raw(std::shared_ptr *gandiva_filter, + GArrowSchema *schema, + GGandivaCondition *condition); +std::shared_ptr +ggandiva_filter_get_raw(GGandivaFilter *filter); diff --git a/c_glib/gandiva-glib/gandiva-glib.h b/c_glib/gandiva-glib/gandiva-glib.h index 7d1c3d92696..9c1a1604d39 100644 --- a/c_glib/gandiva-glib/gandiva-glib.h +++ b/c_glib/gandiva-glib/gandiva-glib.h @@ -22,8 +22,10 @@ #include #include +#include #include #include #include #include #include +#include diff --git a/c_glib/gandiva-glib/gandiva-glib.hpp b/c_glib/gandiva-glib/gandiva-glib.hpp index 8d857a3d8df..eb39f5838ee 100644 --- a/c_glib/gandiva-glib/gandiva-glib.hpp +++ b/c_glib/gandiva-glib/gandiva-glib.hpp @@ -22,5 +22,7 @@ #include #include +#include #include #include +#include diff --git a/c_glib/gandiva-glib/gandiva-glib.pc.in b/c_glib/gandiva-glib/gandiva-glib.pc.in deleted file mode 100644 index 7160f5ff422..00000000000 --- a/c_glib/gandiva-glib/gandiva-glib.pc.in +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: Apache Arrow Gandiva GLib -Description: C API for Apache Arrow Gandiva based on GLib -Version: @VERSION@ -Libs: -L${libdir} -lgandiva-glib -Cflags: -I${includedir} -Requires: gandiva arrow-glib diff --git a/c_glib/gandiva-glib/meson.build b/c_glib/gandiva-glib/meson.build index d4ee81dd22b..5127d67afca 100644 --- a/c_glib/gandiva-glib/meson.build +++ b/c_glib/gandiva-glib/meson.build @@ -21,30 +21,36 @@ project_name = 'gandiva-glib' sources = files( 'expression.cpp', + 'filter.cpp', 'function-registry.cpp', 'function-signature.cpp', - 'node.cpp', 'native-function.cpp', + 'node.cpp', 'projector.cpp', + 'selection-vector.cpp', ) c_headers = files( 'expression.h', + 'filter.h', 'function-registry.h', 'function-signature.h', 'gandiva-glib.h', - 'node.h', 'native-function.h', + 'node.h', 'projector.h', + 'selection-vector.h', ) cpp_headers = files( 'expression.hpp', + 'filter.hpp', 'function-signature.hpp', 'gandiva-glib.hpp', - 'node.hpp', 'native-function.hpp', + 'node.hpp', 'projector.hpp', + 'selection-vector.hpp', ) version_h_conf = configuration_data() diff --git a/c_glib/gandiva-glib/node.cpp b/c_glib/gandiva-glib/node.cpp index 28106624494..68f65c6fa14 100644 --- a/c_glib/gandiva-glib/node.cpp +++ b/c_glib/gandiva-glib/node.cpp @@ -116,9 +116,9 @@ enum { PROP_RETURN_TYPE }; -G_DEFINE_TYPE_WITH_PRIVATE(GGandivaNode, - ggandiva_node, - G_TYPE_OBJECT) +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GGandivaNode, + ggandiva_node, + G_TYPE_OBJECT) #define GGANDIVA_NODE_GET_PRIVATE(object) \ static_cast( \ diff --git a/c_glib/gandiva-glib/projector.cpp b/c_glib/gandiva-glib/projector.cpp index ebcc4873c8d..c1cb19e2d2c 100644 --- a/c_glib/gandiva-glib/projector.cpp +++ b/c_glib/gandiva-glib/projector.cpp @@ -22,12 +22,13 @@ #endif #include +#include #include #include -#include #include #include +#include G_BEGIN_DECLS @@ -36,19 +37,25 @@ G_BEGIN_DECLS * @title: Projector classes * @include: gandiva-glib/gandiva-glib.h * - * #GGandivaProjector is a class for building a specific schema - * and vector of expressions. + * #GGandivaProjector is a class that evaluates given expressions + * against the given record batches. + * + * #GGandivaSelectableProjector is a class that evaluates given expressions + * against the given selected records in the given record batches. 
* * Since: 0.12.0 */ typedef struct GGandivaProjectorPrivate_ { std::shared_ptr projector; + GArrowSchema *schema; + GList *expressions; } GGandivaProjectorPrivate; enum { - PROP_0, - PROP_PROJECTOR + PROP_PROJECTOR = 1, + PROP_SCHEMA, + PROP_EXPRESSIONS, }; G_DEFINE_TYPE_WITH_PRIVATE(GGandivaProjector, @@ -60,6 +67,22 @@ G_DEFINE_TYPE_WITH_PRIVATE(GGandivaProjector, ggandiva_projector_get_instance_private( \ GGANDIVA_PROJECTOR(obj))) +static void +ggandiva_projector_dispose(GObject *object) +{ + auto priv = GGANDIVA_PROJECTOR_GET_PRIVATE(object); + + if (priv->schema) { + g_object_unref(G_OBJECT(priv->schema)); + priv->schema = nullptr; + } + + g_list_free_full(priv->expressions, g_object_unref); + priv->expressions = nullptr; + + G_OBJECT_CLASS(ggandiva_projector_parent_class)->dispose(object); +} + static void ggandiva_projector_finalize(GObject *object) { @@ -83,6 +106,33 @@ ggandiva_projector_set_property(GObject *object, priv->projector = *static_cast *>(g_value_get_pointer(value)); break; + case PROP_SCHEMA: + priv->schema = GARROW_SCHEMA(g_value_dup_object(value)); + break; + case PROP_EXPRESSIONS: + priv->expressions = + g_list_copy_deep(static_cast(g_value_get_pointer(value)), + reinterpret_cast(g_object_ref), + nullptr); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +ggandiva_projector_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GGANDIVA_PROJECTOR_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_SCHEMA: + g_value_set_object(value, priv->schema); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -99,19 +149,35 @@ ggandiva_projector_init(GGandivaProjector *object) static void ggandiva_projector_class_init(GGandivaProjectorClass *klass) { - GParamSpec *spec; - auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = ggandiva_projector_dispose; gobject_class->finalize = ggandiva_projector_finalize; gobject_class->set_property = ggandiva_projector_set_property; + gobject_class->get_property = ggandiva_projector_get_property; + GParamSpec *spec; spec = g_param_spec_pointer("projector", "Projector", "The raw std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_PROJECTOR, spec); + + spec = g_param_spec_object("schema", + "Schema", + "The schema of the projector", + GARROW_TYPE_SCHEMA, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_SCHEMA, spec); + + spec = g_param_spec_pointer("expressions", + "Expressions", + "The expressions for the projector", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_EXPRESSIONS, spec); } /** @@ -143,7 +209,9 @@ ggandiva_projector_new(GArrowSchema *schema, gandiva_expressions, &gandiva_projector); if (garrow_error_check(error, status, "[gandiva][projector][new]")) { - return ggandiva_projector_new_raw(&gandiva_projector); + return ggandiva_projector_new_raw(&gandiva_projector, + schema, + expressions); } else { return NULL; } @@ -185,17 +253,140 @@ ggandiva_projector_evaluate(GGandivaProjector *projector, } } + +G_DEFINE_TYPE(GGandivaSelectableProjector, + ggandiva_selectable_projector, + GGANDIVA_TYPE_PROJECTOR) + +static void +ggandiva_selectable_projector_init(GGandivaSelectableProjector *object) +{ +} + +static void 
+ggandiva_selectable_projector_class_init(GGandivaSelectableProjectorClass *klass) +{ +} + +/** + * ggandiva_selectable_projector_new: + * @schema: A #GArrowSchema. + * @expressions: (element-type GGandivaExpression): The built expressions. + * @mode: A #GGandivaSelectionVectorMode to be used. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GGandivaProjector on success, + * %NULL on error. + * + * Since: 4.0.0 + */ +GGandivaSelectableProjector * +ggandiva_selectable_projector_new(GArrowSchema *schema, + GList *expressions, + GGandivaSelectionVectorMode mode, + GError **error) +{ + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector> gandiva_expressions; + for (auto node = expressions; node; node = g_list_next(node)) { + auto expression = GGANDIVA_EXPRESSION(node->data); + auto gandiva_expression = ggandiva_expression_get_raw(expression); + gandiva_expressions.push_back(gandiva_expression); + } + auto gandiva_mode = static_cast(mode); + auto gandiva_configuration = + gandiva::ConfigurationBuilder::DefaultConfiguration(); + std::shared_ptr gandiva_projector; + auto status = gandiva_projector->Make(arrow_schema, + gandiva_expressions, + gandiva_mode, + gandiva_configuration, + &gandiva_projector); + if (garrow_error_check(error, + status, + "[gandiva][selectable-projector][new]")) { + return ggandiva_selectable_projector_new_raw(&gandiva_projector, + schema, + expressions); + } else { + return NULL; + } +} + +/** + * ggandiva_selectable_projector_evaluate: + * @projector: A #GGandivaSelectableProjector. + * @record_batch: A #GArrowRecordBatch. + * @selection_vector: A #GGandivaSelectionVector that specifies + * the filtered row positions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (element-type GArrowArray) (nullable) (transfer full): + * The #GArrowArray as the result evaluated on success, %NULL on error. 
+ * + * Since: 4.0.0 + */ +GList * +ggandiva_selectable_projector_evaluate( + GGandivaSelectableProjector *projector, + GArrowRecordBatch *record_batch, + GGandivaSelectionVector *selection_vector, + GError **error) +{ + auto gandiva_projector = + ggandiva_projector_get_raw(GGANDIVA_PROJECTOR(projector)); + auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); + auto gandiva_selection_vector = + ggandiva_selection_vector_get_raw(selection_vector).get(); + auto memory_pool = arrow::default_memory_pool(); + arrow::ArrayVector arrow_arrays; + auto status = + gandiva_projector->Evaluate(*arrow_record_batch, + gandiva_selection_vector, + memory_pool, + &arrow_arrays); + if (garrow_error_check(error, + status, + "[gandiva][selectable-projector][evaluate]")) { + GList *arrays = NULL; + for (auto arrow_array : arrow_arrays) { + auto array = garrow_array_new_raw(&arrow_array); + arrays = g_list_prepend(arrays, array); + } + return g_list_reverse(arrays); + } else { + return NULL; + } +} + G_END_DECLS GGandivaProjector * -ggandiva_projector_new_raw(std::shared_ptr *gandiva_projector) +ggandiva_projector_new_raw( + std::shared_ptr *gandiva_projector, + GArrowSchema *schema, + GList *expressions) { auto projector = g_object_new(GGANDIVA_TYPE_PROJECTOR, "projector", gandiva_projector, + "schema", schema, + "expressions", expressions, NULL); return GGANDIVA_PROJECTOR(projector); } +GGandivaSelectableProjector * +ggandiva_selectable_projector_new_raw( + std::shared_ptr *gandiva_projector, + GArrowSchema *schema, + GList *expressions) +{ + auto projector = g_object_new(GGANDIVA_TYPE_SELECTABLE_PROJECTOR, + "projector", gandiva_projector, + NULL); + return GGANDIVA_SELECTABLE_PROJECTOR(projector); +} + std::shared_ptr ggandiva_projector_get_raw(GGandivaProjector *projector) { diff --git a/c_glib/gandiva-glib/projector.h b/c_glib/gandiva-glib/projector.h index ae6dead9521..5dd218b808c 100644 --- a/c_glib/gandiva-glib/projector.h +++ b/c_glib/gandiva-glib/projector.h @@ -19,7 +19,7 @@ #pragma once -#include +#include G_BEGIN_DECLS @@ -35,11 +35,41 @@ struct _GGandivaProjectorClass GObjectClass parent_class; }; -GGandivaProjector *ggandiva_projector_new(GArrowSchema *schema, - GList *expressions, - GError **error); -GList *ggandiva_projector_evaluate(GGandivaProjector *projector, - GArrowRecordBatch *record_batch, - GError **error); +GGandivaProjector * +ggandiva_projector_new(GArrowSchema *schema, + GList *expressions, + GError **error); +GList * +ggandiva_projector_evaluate(GGandivaProjector *projector, + GArrowRecordBatch *record_batch, + GError **error); + + +#define GGANDIVA_TYPE_SELECTABLE_PROJECTOR \ + (ggandiva_selectable_projector_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaSelectableProjector, + ggandiva_selectable_projector, + GGANDIVA, + SELECTABLE_PROJECTOR, + GGandivaProjector) + +struct _GGandivaSelectableProjectorClass +{ + GGandivaProjectorClass parent_class; +}; + +GGANDIVA_AVAILABLE_IN_4_0 +GGandivaSelectableProjector * +ggandiva_selectable_projector_new(GArrowSchema *schema, + GList *expressions, + GGandivaSelectionVectorMode mode, + GError **error); +GGANDIVA_AVAILABLE_IN_4_0 +GList * +ggandiva_selectable_projector_evaluate(GGandivaSelectableProjector *projector, + GArrowRecordBatch *record_batch, + GGandivaSelectionVector *selection_vector, + GError **error); + G_END_DECLS diff --git a/c_glib/gandiva-glib/projector.hpp b/c_glib/gandiva-glib/projector.hpp index 1e9359b3342..b372f32f598 100644 --- a/c_glib/gandiva-glib/projector.hpp +++ b/c_glib/gandiva-glib/projector.hpp 
@@ -25,5 +25,15 @@ #include -GGandivaProjector *ggandiva_projector_new_raw(std::shared_ptr *gandiva_projector); -std::shared_ptr ggandiva_projector_get_raw(GGandivaProjector *projector); +GGandivaProjector * +ggandiva_projector_new_raw( + std::shared_ptr *gandiva_projector, + GArrowSchema *schema, + GList *expressions); +GGandivaSelectableProjector * +ggandiva_selectable_projector_new_raw( + std::shared_ptr *gandiva_projector, + GArrowSchema *schema, + GList *expressions); +std::shared_ptr +ggandiva_projector_get_raw(GGandivaProjector *projector); diff --git a/c_glib/gandiva-glib/selection-vector.cpp b/c_glib/gandiva-glib/selection-vector.cpp new file mode 100644 index 00000000000..1c1fa0448fa --- /dev/null +++ b/c_glib/gandiva-glib/selection-vector.cpp @@ -0,0 +1,327 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include + +#include + + +G_BEGIN_DECLS + +/** + * SECTION: selection-vector + * @section_id: selection-vector-classes + * @title: Selection vector classes + * @include: gandiva-glib/gandiva-glib.h + * + * #GGandivaSelectionVector is a base class for a selection vector. + * + * #GGandivaUInt16SelectionVector is a class for a selection vector + * that uses 16-bit unsigned integer for each index. + * + * #GGandivaUInt32SelectionVector is a class for a selection vector + * that uses 32-bit unsigned integer for each index. + * + * #GGandivaUInt64SelectionVector is a class for a selection vector + * that uses 64-bit unsigned integer for each index. 
+ * + * Since: 4.0.0 + */ + +typedef struct GGandivaSelectionVectorPrivate_ { + std::shared_ptr selection_vector; +} GGandivaSelectionVectorPrivate; + +enum { + PROP_SELECTION_VECTOR = 1, +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GGandivaSelectionVector, + ggandiva_selection_vector, + G_TYPE_OBJECT) + +#define GGANDIVA_SELECTION_VECTOR_GET_PRIVATE(object) \ + static_cast( \ + ggandiva_selection_vector_get_instance_private( \ + GGANDIVA_SELECTION_VECTOR(object))) + +static void +ggandiva_selection_vector_finalize(GObject *object) +{ + auto priv = GGANDIVA_SELECTION_VECTOR_GET_PRIVATE(object); + + priv->selection_vector.~shared_ptr(); + + G_OBJECT_CLASS(ggandiva_selection_vector_parent_class)->finalize(object); +} + +static void +ggandiva_selection_vector_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GGANDIVA_SELECTION_VECTOR_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_SELECTION_VECTOR: + priv->selection_vector = + *static_cast *>( + g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +ggandiva_selection_vector_init(GGandivaSelectionVector *object) +{ + auto priv = GGANDIVA_SELECTION_VECTOR_GET_PRIVATE(object); + new(&priv->selection_vector) std::shared_ptr; +} + +static void +ggandiva_selection_vector_class_init(GGandivaSelectionVectorClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = ggandiva_selection_vector_finalize; + gobject_class->set_property = ggandiva_selection_vector_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("selection-vector", + "Selection vector", + "The raw std::shared *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_SELECTION_VECTOR, spec); +} + +/** + * ggandiva_selection_vector_get_mode: + * @selection_vector: A #GGandivaSelectionVector. + * + * Returns: A #GGandivaSelectionVectorMode for the selection vector. + * + * Since: 4.0.0 + */ +GGandivaSelectionVectorMode +ggandiva_selection_vector_get_mode(GGandivaSelectionVector *selection_vector) +{ + auto gandiva_selection_vector = + ggandiva_selection_vector_get_raw(selection_vector); + auto gandiva_mode = gandiva_selection_vector->GetMode(); + return static_cast(gandiva_mode); +} + +/** + * ggandiva_selection_vector_to_array: + * @selection_vector: A #GGandivaSelectionVector. + * + * Returns: (transfer full): A #GArrowArray that has the same content + * of the selection vector. + * + * Since: 4.0.0 + */ +GArrowArray * +ggandiva_selection_vector_to_array(GGandivaSelectionVector *selection_vector) +{ + auto gandiva_selection_vector = + ggandiva_selection_vector_get_raw(selection_vector); + auto arrow_array = gandiva_selection_vector->ToArray(); + return garrow_array_new_raw(&arrow_array); +} + + +G_DEFINE_TYPE(GGandivaUInt16SelectionVector, + ggandiva_uint16_selection_vector, + GGANDIVA_TYPE_SELECTION_VECTOR) + +static void +ggandiva_uint16_selection_vector_init( + GGandivaUInt16SelectionVector *selection_vector) +{ +} + +static void +ggandiva_uint16_selection_vector_class_init( + GGandivaUInt16SelectionVectorClass *klass) +{ +} + +/** + * ggandiva_uint16_selection_vector_new: + * @max_slots: The max number of slots. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: A newly created #GGandivaUInt16SelectionVector. 
+ * + * Since: 4.0.0 + */ +GGandivaUInt16SelectionVector * +ggandiva_uint16_selection_vector_new(gint64 max_slots, + GError **error) +{ + auto memory_pool = arrow::default_memory_pool(); + std::shared_ptr gandiva_selection_vector; + auto status = gandiva::SelectionVector::MakeInt16(max_slots, + memory_pool, + &gandiva_selection_vector); + if (garrow_error_check(error, + status, + "[gandiva][uint16-selection-vector][new]")) { + return GGANDIVA_UINT16_SELECTION_VECTOR( + ggandiva_selection_vector_new_raw(&gandiva_selection_vector)); + } else { + return NULL; + } +} + + +G_DEFINE_TYPE(GGandivaUInt32SelectionVector, + ggandiva_uint32_selection_vector, + GGANDIVA_TYPE_SELECTION_VECTOR) + +static void +ggandiva_uint32_selection_vector_init( + GGandivaUInt32SelectionVector *selection_vector) +{ +} + +static void +ggandiva_uint32_selection_vector_class_init( + GGandivaUInt32SelectionVectorClass *klass) +{ +} + +/** + * ggandiva_uint32_selection_vector_new: + * @max_slots: The max number of slots. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: A newly created #GGandivaUInt32SelectionVector. + * + * Since: 4.0.0 + */ +GGandivaUInt32SelectionVector * +ggandiva_uint32_selection_vector_new(gint64 max_slots, + GError **error) +{ + auto memory_pool = arrow::default_memory_pool(); + std::shared_ptr gandiva_selection_vector; + auto status = gandiva::SelectionVector::MakeInt32(max_slots, + memory_pool, + &gandiva_selection_vector); + if (garrow_error_check(error, + status, + "[gandiva][uint32-selection-vector][new]")) { + return GGANDIVA_UINT32_SELECTION_VECTOR( + ggandiva_selection_vector_new_raw(&gandiva_selection_vector)); + } else { + return NULL; + } +} + + +G_DEFINE_TYPE(GGandivaUInt64SelectionVector, + ggandiva_uint64_selection_vector, + GGANDIVA_TYPE_SELECTION_VECTOR) + +static void +ggandiva_uint64_selection_vector_init( + GGandivaUInt64SelectionVector *selection_vector) +{ +} + +static void +ggandiva_uint64_selection_vector_class_init( + GGandivaUInt64SelectionVectorClass *klass) +{ +} + +/** + * ggandiva_uint64_selection_vector_new: + * @max_slots: The max number of slots. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: A newly created #GGandivaUInt64SelectionVector. 
+ * + * Since: 4.0.0 + */ +GGandivaUInt64SelectionVector * +ggandiva_uint64_selection_vector_new(gint64 max_slots, + GError **error) +{ + auto memory_pool = arrow::default_memory_pool(); + std::shared_ptr gandiva_selection_vector; + auto status = gandiva::SelectionVector::MakeInt64(max_slots, + memory_pool, + &gandiva_selection_vector); + if (garrow_error_check(error, + status, + "[gandiva][uint64-selection-vector][new]")) { + return GGANDIVA_UINT64_SELECTION_VECTOR( + ggandiva_selection_vector_new_raw(&gandiva_selection_vector)); + } else { + return NULL; + } +} + + +G_END_DECLS + + +GGandivaSelectionVector * +ggandiva_selection_vector_new_raw( + std::shared_ptr *gandiva_selection_vector) +{ + GType type = GGANDIVA_TYPE_SELECTION_VECTOR; + switch ((*gandiva_selection_vector)->GetMode()) { + case gandiva::SelectionVector::Mode::MODE_UINT16: + type = GGANDIVA_TYPE_UINT16_SELECTION_VECTOR; + break; + case gandiva::SelectionVector::Mode::MODE_UINT32: + type = GGANDIVA_TYPE_UINT32_SELECTION_VECTOR; + break; + case gandiva::SelectionVector::Mode::MODE_UINT64: + type = GGANDIVA_TYPE_UINT64_SELECTION_VECTOR; + break; + default: + break; + } + auto selection_vector = + g_object_new(type, + "selection-vector", gandiva_selection_vector, + NULL); + return GGANDIVA_SELECTION_VECTOR(selection_vector); +} + +std::shared_ptr +ggandiva_selection_vector_get_raw(GGandivaSelectionVector *selection_vector) +{ + auto priv = GGANDIVA_SELECTION_VECTOR_GET_PRIVATE(selection_vector); + return priv->selection_vector; +} diff --git a/c_glib/gandiva-glib/selection-vector.h b/c_glib/gandiva-glib/selection-vector.h new file mode 100644 index 00000000000..029c4cde5ca --- /dev/null +++ b/c_glib/gandiva-glib/selection-vector.h @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include + +G_BEGIN_DECLS + +/** + * GGandivaSelectionVectorMode: + * @GGANDIVA_SELECTION_VECTOR_MODE_NONE: Selection vector isn't used. + * @GGANDIVA_SELECTION_VECTOR_MODE_UINT16: + * #GGandivaUInt16SelectionVector is used. + * @GGANDIVA_SELECTION_VECTOR_MODE_UINT32: + * #GGandivaUInt32SelectionVector is used. + * @GGANDIVA_SELECTION_VECTOR_MODE_UINT64: + * #GGandivaUInt64SelectionVector is used. + * + * They are corresponding to `gandiva::SelectionVector::Mode` values. 
+ * + * Since: 4.0.0 + */ +typedef enum { + GGANDIVA_SELECTION_VECTOR_MODE_NONE, + GGANDIVA_SELECTION_VECTOR_MODE_UINT16, + GGANDIVA_SELECTION_VECTOR_MODE_UINT32, + GGANDIVA_SELECTION_VECTOR_MODE_UINT64, +} GGandivaSelectionVectorMode; + + +#define GGANDIVA_TYPE_SELECTION_VECTOR (ggandiva_selection_vector_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaSelectionVector, + ggandiva_selection_vector, + GGANDIVA, + SELECTION_VECTOR, + GObject) + +struct _GGandivaSelectionVectorClass +{ + GObjectClass parent_class; +}; + +GGANDIVA_AVAILABLE_IN_4_0 +GGandivaSelectionVectorMode +ggandiva_selection_vector_get_mode(GGandivaSelectionVector *selection_vector); + +GGANDIVA_AVAILABLE_IN_4_0 +GArrowArray * +ggandiva_selection_vector_to_array(GGandivaSelectionVector *selection_vector); + + +#define GGANDIVA_TYPE_UINT16_SELECTION_VECTOR \ + (ggandiva_uint16_selection_vector_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaUInt16SelectionVector, + ggandiva_uint16_selection_vector, + GGANDIVA, + UINT16_SELECTION_VECTOR, + GGandivaSelectionVector) + +struct _GGandivaUInt16SelectionVectorClass +{ + GGandivaSelectionVectorClass parent_class; +}; + +GGANDIVA_AVAILABLE_IN_4_0 +GGandivaUInt16SelectionVector * +ggandiva_uint16_selection_vector_new(gint64 max_slots, + GError **error); + + +#define GGANDIVA_TYPE_UINT32_SELECTION_VECTOR \ + (ggandiva_uint32_selection_vector_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaUInt32SelectionVector, + ggandiva_uint32_selection_vector, + GGANDIVA, + UINT32_SELECTION_VECTOR, + GGandivaSelectionVector) + +struct _GGandivaUInt32SelectionVectorClass +{ + GGandivaSelectionVectorClass parent_class; +}; + +GGANDIVA_AVAILABLE_IN_4_0 +GGandivaUInt32SelectionVector * +ggandiva_uint32_selection_vector_new(gint64 max_slots, + GError **error); + + +#define GGANDIVA_TYPE_UINT64_SELECTION_VECTOR \ + (ggandiva_uint64_selection_vector_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaUInt64SelectionVector, + ggandiva_uint64_selection_vector, + GGANDIVA, + UINT64_SELECTION_VECTOR, + GGandivaSelectionVector) + +struct _GGandivaUInt64SelectionVectorClass +{ + GGandivaSelectionVectorClass parent_class; +}; + +GGANDIVA_AVAILABLE_IN_4_0 +GGandivaUInt64SelectionVector * +ggandiva_uint64_selection_vector_new(gint64 max_slots, + GError **error); + + +G_END_DECLS diff --git a/c_glib/gandiva-glib/selection-vector.hpp b/c_glib/gandiva-glib/selection-vector.hpp new file mode 100644 index 00000000000..aec583141e9 --- /dev/null +++ b/c_glib/gandiva-glib/selection-vector.hpp @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include + +#include + +#include + +GGandivaSelectionVector * +ggandiva_selection_vector_new_raw( + std::shared_ptr *gandiva_selection_vector); +std::shared_ptr +ggandiva_selection_vector_get_raw(GGandivaSelectionVector *selection_vector); diff --git a/c_glib/gandiva-glib/version.h.in b/c_glib/gandiva-glib/version.h.in index 85cfe6d0cfb..3c9e87c9d52 100644 --- a/c_glib/gandiva-glib/version.h.in +++ b/c_glib/gandiva-glib/version.h.in @@ -119,6 +119,15 @@ */ #define GGANDIVA_VERSION_1_0 G_ENCODE_VERSION(1, 0) +/** + * GGANDIVA_VERSION_4_0: + * + * You can use this macro value for compile time API version check. + * + * Since: 4.0.0 + */ +#define GGANDIVA_VERSION_4_0 G_ENCODE_VERSION(4, 0) + /** * GGANDIVA_VERSION_MIN_REQUIRED: * @@ -166,6 +175,20 @@ #define GGANDIVA_AVAILABLE_IN_ALL +#if GGANDIVA_VERSION_MIN_REQUIRED >= GGANDIVA_VERSION_4_0 +# define GGANDIVA_DEPRECATED_IN_4_0 GGANDIVA_DEPRECATED +# define GGANDIVA_DEPRECATED_IN_4_0_FOR(function) GGANDIVA_DEPRECATED_FOR(function) +#else +# define GGANDIVA_DEPRECATED_IN_4_0 +# define GGANDIVA_DEPRECATED_IN_4_0_FOR(function) +#endif + +#if GGANDIVA_VERSION_MAX_ALLOWED < GGANDIVA_VERSION_4_0 +# define GGANDIVA_AVAILABLE_IN_4_0 GGANDIVA_UNAVAILABLE(4, 0) +#else +# define GGANDIVA_AVAILABLE_IN_4_0 +#endif + #if GGANDIVA_VERSION_MIN_REQUIRED >= GGANDIVA_VERSION_1_0 # define GGANDIVA_DEPRECATED_IN_1_0 GGANDIVA_DEPRECATED # define GGANDIVA_DEPRECATED_IN_1_0_FOR(function) GGANDIVA_DEPRECATED_FOR(function) diff --git a/c_glib/parquet-glib/Makefile.am b/c_glib/parquet-glib/Makefile.am deleted file mode 100644 index a813b3ce9cc..00000000000 --- a/c_glib/parquet-glib/Makefile.am +++ /dev/null @@ -1,145 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -CLEANFILES = -DISTCLEANFILES = - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(GARROW_CFLAGS) \ - $(GPARQUET_CFLAGS) - -if HAVE_PARQUET -lib_LTLIBRARIES = \ - libparquet-glib.la - -libparquet_glib_la_CXXFLAGS = \ - $(GLIB_CFLAGS) \ - $(PARQUET_CFLAGS) \ - $(GARROW_CFLAGS) \ - $(GPARQUET_CXXFLAGS) - -libparquet_glib_la_LDFLAGS = \ - -version-info $(LT_VERSION_INFO) \ - -no-undefined - -libparquet_glib_la_LIBADD = \ - $(GLIB_LIBS) \ - ../arrow-glib/libarrow-glib.la \ - $(PARQUET_LIBS) - -libparquet_glib_la_headers = \ - arrow-file-reader.h \ - arrow-file-writer.h \ - parquet-glib.h - -libparquet_glib_la_generated_headers = \ - version.h - -libparquet_glib_la_sources = \ - arrow-file-reader.cpp \ - arrow-file-writer.cpp \ - $(libparquet_glib_la_headers) \ - $(libparquet_glib_la_generated_headers) - -libparquet_glib_la_cpp_headers = \ - arrow-file-reader.hpp \ - arrow-file-writer.hpp \ - parquet-glib.hpp - -libparquet_glib_la_SOURCES = \ - $(libparquet_glib_la_sources) \ - $(libparquet_glib_la_cpp_headers) - -BUILT_SOURCES = \ - $(libparquet_glib_la_generated_headers) - -parquet_glib_includedir = $(includedir)/parquet-glib -parquet_glib_include_HEADERS = \ - $(libparquet_glib_la_headers) \ - $(libparquet_glib_la_cpp_headers) - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = \ - parquet-glib.pc - -# GObject Introspection -if HAVE_INTROSPECTION --include $(INTROSPECTION_MAKEFILE) -INTROSPECTION_GIRS = -INTROSPECTION_SCANNER_ARGS = -INTROSPECTION_SCANNER_ENV = -if USE_ARROW_BUILD_DIR -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_top_builddir}/arrow-glib:$(ARROW_BUILD_DIR)/src/arrow:$${PKG_CONFIG_PATH} -else -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_top_builddir}/arrow-glib:$${PKG_CONFIG_PATH} -endif -INTROSPECTION_COMPILER_ARGS = \ - --includedir=$(abs_top_builddir)/arrow-glib - -Parquet-1.0.gir: libparquet-glib.la -Parquet_1_0_gir_PACKAGES = \ - arrow-glib -Parquet_1_0_gir_EXPORT_PACKAGES = \ - parquet-glib -Parquet_1_0_gir_INCLUDES = \ - Arrow-1.0 -Parquet_1_0_gir_CFLAGS = \ - $(AM_CPPFLAGS) -Parquet_1_0_gir_LIBS = -Parquet_1_0_gir_FILES = $(libparquet_glib_la_sources) -Parquet_1_0_gir_SCANNERFLAGS = \ - --add-include-path=$(abs_top_builddir)/arrow-glib \ - --library-path=$(ARROW_LIB_DIR) \ - --warn-all \ - --identifier-prefix=GParquet \ - --symbol-prefix=gparquet -if OS_MACOS -Parquet_1_0_gir_LIBS += \ - arrow-glib \ - parquet-glib -Parquet_1_0_gir_SCANNERFLAGS += \ - --no-libtool \ - --library-path=$(abs_top_builddir)/arrow-glib/.libs \ - --library-path=$(abs_builddir)/.libs -else -Parquet_1_0_gir_LIBS += \ - $(abs_top_builddir)/arrow-glib/libarrow-glib.la \ - libparquet-glib.la -endif -INTROSPECTION_GIRS += Parquet-1.0.gir - -girdir = $(datadir)/gir-1.0 -gir_DATA = $(INTROSPECTION_GIRS) - -typelibdir = $(libdir)/girepository-1.0 -typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) - -CLEANFILES += \ - $(gir_DATA) \ - $(typelib_DATA) -endif -endif diff --git a/c_glib/parquet-glib/parquet-glib.pc.in b/c_glib/parquet-glib/parquet-glib.pc.in deleted file mode 100644 index 81559f1bce1..00000000000 --- a/c_glib/parquet-glib/parquet-glib.pc.in +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: Apache Parquet GLib -Description: C API for Apache Parquet based on GLib -Version: @VERSION@ -Libs: -L${libdir} -lparquet-glib -Cflags: -I${includedir} -Requires: arrow-glib diff --git a/c_glib/plasma-glib/Makefile.am b/c_glib/plasma-glib/Makefile.am deleted file mode 100644 index 60499a4065f..00000000000 --- a/c_glib/plasma-glib/Makefile.am +++ /dev/null @@ -1,171 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -CLEANFILES = -DISTCLEANFILES = - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) \ - -DG_LOG_DOMAIN=\"Plasma\" - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(GARROW_CFLAGS) \ - $(GPLASMA_CFLAGS) - -PLASMA_ARROW_CUDA_LIBS = -PLASMA_INTROSPECTION_COMPILER_ARROW_CUDA_ARGS = -PLASMA_GIR_ARROW_CUDA_PACKAGE = -PLASMA_GIR_ARROW_CUDA_SCANNER_ADD_INCLUDE_PATH = -PLASMA_GIR_ARROW_CUDA_LIBS_MACOS = -PLASMA_GIR_ARROW_CUDA_SCANNER_LIBRARY_PATH_MACOS = -PLASMA_GIR_ARROW_CUDA_LIBS = -if HAVE_ARROW_CUDA -PLASMA_ARROW_CUDA_LIBS += \ - ../arrow-cuda-glib/libarrow-cuda-glib.la \ - $(ARROW_CUDA_LIBS) -PLASMA_INTROSPECTION_COMPILER_ARROW_CUDA_ARGS += \ - --includedir=$(abs_top_builddir)/arrow-cuda-glib -PLASMA_GIR_ARROW_CUDA_PACKAGE += \ - arrow-cuda-glib -PLASMA_GIR_ARROW_CUDA_SCANNER_ADD_INCLUDE_PATH += \ - --add-include-path=$(abs_top_builddir)/arrow-cuda-glib -PLASMA_GIR_ARROW_CUDA_LIBS_MACOS += \ - arrow-cuda-glib -PLASMA_GIR_ARROW_CUDA_SCANNER_LIBRARY_PATH_MACOS += \ - --library-path=$(abs_top_builddir)/arrow-cuda-glib/.libs -PLASMA_GIR_ARROW_CUDA_LIBS += \ - $(abs_top_builddir)/arrow-cuda-glib/libarrow-cuda-glib.la -endif - -if HAVE_PLASMA -lib_LTLIBRARIES = \ - libplasma-glib.la - -libplasma_glib_la_CXXFLAGS = \ - $(GLIB_CFLAGS) \ - $(PLASMA_CFLAGS) \ - $(GARROW_CFLAGS) \ - $(GPLASMA_CFLAGS) - -libplasma_glib_la_LDFLAGS = \ - -version-info $(LT_VERSION_INFO) \ - -no-undefined - -libplasma_glib_la_LIBADD = \ - $(GLIB_LIBS) \ - ../arrow-glib/libarrow-glib.la \ - $(PLASMA_LIBS) \ - $(PLASMA_ARROW_CUDA_LIBS) - -libplasma_glib_la_headers = \ - client.h \ - object.h \ - plasma-glib.h - -libplasma_glib_la_sources = \ - client.cpp \ - object.cpp \ - $(libplasma_glib_la_headers) - -libplasma_glib_la_cpp_headers = \ - client.hpp \ - object.hpp \ - plasma-glib.hpp - -libplasma_glib_la_SOURCES = \ - $(libplasma_glib_la_sources) \ - $(libplasma_glib_la_cpp_headers) - -plasma_glib_includedir = $(includedir)/plasma-glib -plasma_glib_include_HEADERS = \ - $(libplasma_glib_la_headers) \ - $(libplasma_glib_la_cpp_headers) - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = \ - plasma-glib.pc - -# GObject Introspection -if HAVE_INTROSPECTION --include $(INTROSPECTION_MAKEFILE) -INTROSPECTION_GIRS = -INTROSPECTION_SCANNER_ARGS = -INTROSPECTION_SCANNER_ENV = -if USE_ARROW_BUILD_DIR -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=$(abs_top_builddir)/arrow-glib$(PLASMA_ARROW_CUDA_PKG_CONFIG_PATH):$(ARROW_BUILD_DIR)/src/arrow:$${PKG_CONFIG_PATH} -else -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=$(abs_top_builddir)/arrow-glib$(PLASMA_ARROW_CUDA_PKG_CONFIG_PATH):$${PKG_CONFIG_PATH} -endif -INTROSPECTION_COMPILER_ARGS = \ - --includedir=$(abs_top_builddir)/arrow-glib \ - $(PLASMA_INTROSPECTION_COMPILER_ARROW_CUDA_INCLUDEDIR) - -Plasma-1.0.gir: libplasma-glib.la -Plasma_1_0_gir_PACKAGES = \ - arrow-glib \ - $(PLASMA_GIR_ARROW_CUDA_PACKAGE) -Plasma_1_0_gir_EXPORT_PACKAGES = \ - plasma-glib -Plasma_1_0_gir_INCLUDES = \ - Arrow-1.0 -Plasma_1_0_gir_CFLAGS = \ - $(AM_CPPFLAGS) -Plasma_1_0_gir_LIBS = -Plasma_1_0_gir_FILES = $(libplasma_glib_la_sources) -Plasma_1_0_gir_SCANNERFLAGS = \ - --add-include-path=$(abs_top_builddir)/arrow-glib \ - $(PLASMA_GIR_ARROW_CUDA_SCANNER_ADD_INCLUDE_PATH) \ - --library-path=$(ARROW_LIB_DIR) \ - --warn-all \ - --identifier-prefix=GPlasma \ - --symbol-prefix=gplasma -if OS_MACOS -Plasma_1_0_gir_LIBS += \ - arrow-glib \ - $(PLASMA_GIR_ARROW_CUDA_LIBS_MACOS) \ - plasma-glib -Plasma_1_0_gir_SCANNERFLAGS += \ - --no-libtool \ - 
--library-path=$(abs_top_builddir)/arrow-glib/.libs \ - $(PLASMA_GIR_ARROW_CUDA_SCANNER_LIBRARY_PATH_MACOS) \ - --library-path=$(abs_builddir)/.libs -else -Plasma_1_0_gir_LIBS += \ - $(abs_top_builddir)/arrow-glib/libarrow-glib.la \ - $(PLASMA_GIR_ARROW_CUDA_LIBS) \ - libplasma-glib.la -endif -INTROSPECTION_GIRS += Plasma-1.0.gir - -girdir = $(datadir)/gir-1.0 -gir_DATA = $(INTROSPECTION_GIRS) - -typelibdir = $(libdir)/girepository-1.0 -typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) - -CLEANFILES += \ - $(gir_DATA) \ - $(typelib_DATA) -endif -endif diff --git a/c_glib/plasma-glib/plasma-glib.pc.in b/c_glib/plasma-glib/plasma-glib.pc.in deleted file mode 100644 index c82fe69580f..00000000000 --- a/c_glib/plasma-glib/plasma-glib.pc.in +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: Apache Arrow Plasma GLib -Description: C API for Apache Arrow Plasma based on GLib -Version: @VERSION@ -Libs: -L${libdir} -lplasma-glib -Cflags: -I${includedir} -Requires: plasma arrow-glib @ARROW_CUDA_GLIB_PACKAGE@ diff --git a/c_glib/arrow-dataset-glib/arrow-dataset-glib.pc.in b/c_glib/test/gandiva/test-condition.rb similarity index 51% rename from c_glib/arrow-dataset-glib/arrow-dataset-glib.pc.in rename to c_glib/test/gandiva/test-condition.rb index ee7e13967df..51fb9f1b160 100644 --- a/c_glib/arrow-dataset-glib/arrow-dataset-glib.pc.in +++ b/c_glib/test/gandiva/test-condition.rb @@ -15,14 +15,21 @@ # specific language governing permissions and limitations # under the License. 
-prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ +class TestGandivaCondition < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + field1 = Arrow::Field.new("field1", Arrow::Int32DataType.new) + field2 = Arrow::Field.new("field2", Arrow::Int32DataType.new) + field1_node = Gandiva::FieldNode.new(field1) + field2_node = Gandiva::FieldNode.new(field2) + function_node = Gandiva::FunctionNode.new("equal", + [field1_node, field2_node], + Arrow::BooleanDataType.new) + @condition = Gandiva::Condition.new(function_node) + end -Name: Apache Arrow Dataset GLib -Description: C API for Apache Arrow Dataset based on GLib -Version: @VERSION@ -Libs: -L${libdir} -larrow-dataset-glib -Cflags: -I${includedir} -Requires: arrow-glib arrow-dataset + def test_to_s + assert_equal("bool equal((int32) field1, (int32) field2)", + @condition.to_s) + end +end diff --git a/c_glib/test/gandiva/test-filter.rb b/c_glib/test/gandiva/test-filter.rb new file mode 100644 index 00000000000..3da77743174 --- /dev/null +++ b/c_glib/test/gandiva/test-filter.rb @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestGandivaFilter < Test::Unit::TestCase + include Helper::Buildable + + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + + field1 = Arrow::Field.new("field1", Arrow::Int32DataType.new) + field2 = Arrow::Field.new("field2", Arrow::Int32DataType.new) + schema = Arrow::Schema.new([field1, field2]) + field_node1 = Gandiva::FieldNode.new(field1) + field_node2 = Gandiva::FieldNode.new(field2) + equal_function_node = + Gandiva::FunctionNode.new("equal", + [field_node1, field_node2], + Arrow::BooleanDataType.new) + condition = Gandiva::Condition.new(equal_function_node) + @filter = Gandiva::Filter.new(schema, condition) + + input_arrays = [ + build_int32_array([1, 2, 3, 4]), + build_int32_array([11, 2, 15, 4]), + ] + @record_batch = Arrow::RecordBatch.new(schema, + input_arrays[0].length, + input_arrays) + end + + def test_evaluate + selection_vector = Gandiva::UInt16SelectionVector.new(@record_batch.n_rows) + @filter.evaluate(@record_batch, selection_vector) + assert_equal(build_uint16_array([1, 3]), + selection_vector.to_array) + end +end diff --git a/c_glib/test/gandiva/test-projector.rb b/c_glib/test/gandiva/test-projector.rb index 4d3375659ae..308e1c3a5c9 100644 --- a/c_glib/test/gandiva/test-projector.rb +++ b/c_glib/test/gandiva/test-projector.rb @@ -20,33 +20,40 @@ class TestGandivaProjector < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) - end - def test_evaluate field1 = Arrow::Field.new("field1", Arrow::Int32DataType.new) field2 = Arrow::Field.new("field2", Arrow::Int32DataType.new) - schema = Arrow::Schema.new([field1, field2]) - field_node1 = Gandiva::FieldNode.new(field1) - field_node2 = Gandiva::FieldNode.new(field2) - add_function_node = Gandiva::FunctionNode.new("add", - [field_node1, field_node2], - Arrow::Int32DataType.new) - subtract_function_node = Gandiva::FunctionNode.new("subtract", - [field_node1, field_node2], - Arrow::Int32DataType.new) + @schema = Arrow::Schema.new([field1, field2]) + @field_node1 = Gandiva::FieldNode.new(field1) + @field_node2 = Gandiva::FieldNode.new(field2) + add_function_node = + Gandiva::FunctionNode.new("add", + [@field_node1, @field_node2], + Arrow::Int32DataType.new) + subtract_function_node = + Gandiva::FunctionNode.new("subtract", + [@field_node1, @field_node2], + Arrow::Int32DataType.new) add_result = Arrow::Field.new("add_result", Arrow::Int32DataType.new) add_expression = Gandiva::Expression.new(add_function_node, add_result) - subtract_result = Arrow::Field.new("subtract_result", Arrow::Int32DataType.new) - subtract_expression = Gandiva::Expression.new(subtract_function_node, subtract_result) + subtract_result = Arrow::Field.new("subtract_result", + Arrow::Int32DataType.new) + subtract_expression = Gandiva::Expression.new(subtract_function_node, + subtract_result) + @projector = Gandiva::Projector.new(@schema, + [add_expression, subtract_expression]) - projector = Gandiva::Projector.new(schema, - [add_expression, subtract_expression]) input_arrays = [ build_int32_array([1, 2, 3, 4]), build_int32_array([11, 13, 15, 17]), ] - record_batch = Arrow::RecordBatch.new(schema, 4, input_arrays) - outputs = projector.evaluate(record_batch) + @record_batch = Arrow::RecordBatch.new(@schema, + input_arrays[0].length, + input_arrays) + end + + def test_evaluate + outputs = @projector.evaluate(@record_batch) assert_equal([ [12, 15, 18, 21], [-10, -11, -12, -13], diff --git a/c_glib/test/gandiva/test-selectable-projector.rb b/c_glib/test/gandiva/test-selectable-projector.rb new file 
mode 100644 index 00000000000..47b0059a2ef --- /dev/null +++ b/c_glib/test/gandiva/test-selectable-projector.rb @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaSelectableProjector < Test::Unit::TestCase + include Helper::Buildable + + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + + field1 = Arrow::Field.new("field1", Arrow::Int32DataType.new) + field2 = Arrow::Field.new("field2", Arrow::Int32DataType.new) + @schema = Arrow::Schema.new([field1, field2]) + + input_arrays = [ + build_int32_array([1, 2, 3, 4]), + build_int32_array([11, 13, 15, 17]), + ] + @record_batch = Arrow::RecordBatch.new(@schema, + input_arrays[0].length, + input_arrays) + + @field_node1 = Gandiva::FieldNode.new(field1) + @field_node2 = Gandiva::FieldNode.new(field2) + add_function_node = + Gandiva::FunctionNode.new("add", + [@field_node1, @field_node2], + Arrow::Int32DataType.new) + subtract_function_node = + Gandiva::FunctionNode.new("subtract", + [@field_node1, @field_node2], + Arrow::Int32DataType.new) + add_result = Arrow::Field.new("add_result", Arrow::Int32DataType.new) + add_expression = Gandiva::Expression.new(add_function_node, add_result) + subtract_result = Arrow::Field.new("subtract_result", + Arrow::Int32DataType.new) + subtract_expression = Gandiva::Expression.new(subtract_function_node, + subtract_result) + @selection_vector = Gandiva::UInt16SelectionVector.new(@record_batch.n_rows) + @projector = + Gandiva::SelectableProjector.new(@schema, + [add_expression, subtract_expression], + @selection_vector.mode) + end + + def test_evaluate + two_node = Gandiva::Int32LiteralNode.new(2) + condition_node = Gandiva::FunctionNode.new("greater_than", + [@field_node1, two_node], + Arrow::BooleanDataType.new) + condition = Gandiva::Condition.new(condition_node) + filter = Gandiva::Filter.new(@schema, condition) + filter.evaluate(@record_batch, @selection_vector) + outputs = @projector.evaluate(@record_batch, @selection_vector) + assert_equal([ + [18, 21], + [-12, -13], + ], + outputs.collect(&:values)) + end +end diff --git a/dev/release/source/build.sh b/c_glib/test/gandiva/test-selection-vector.rb old mode 100755 new mode 100644 similarity index 53% rename from dev/release/source/build.sh rename to c_glib/test/gandiva/test-selection-vector.rb index 558600e1fb7..ca5042c2874 --- a/dev/release/source/build.sh +++ b/c_glib/test/gandiva/test-selection-vector.rb @@ -1,5 +1,3 @@ -#!/bin/bash -# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,17 +15,28 @@ # specific language governing permissions and limitations # under the License. 
-set -e +class TestGandivaSelectionVector < Test::Unit::TestCase + include Helper::Buildable + + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end -archive_name=$1 -c_glib_including_configure_tar_gz=$2 + def test_uint16 + selection_vector = Gandiva::UInt16SelectionVector.new(10) + assert_equal(build_uint16_array([]), + selection_vector.to_array) + end -tar xf /arrow/${archive_name}.tar + def test_uint32 + selection_vector = Gandiva::UInt32SelectionVector.new(10) + assert_equal(build_uint32_array([]), + selection_vector.to_array) + end -# Run autogen.sh to create c_glib/ source archive containing the configure script -cd ${archive_name}/c_glib -./autogen.sh -rm -rf autom4te.cache -cd - -mv ${archive_name}/c_glib/ c_glib/ -tar czf /arrow/${c_glib_including_configure_tar_gz} c_glib + def test_uint64 + selection_vector = Gandiva::UInt64SelectionVector.new(10) + assert_equal(build_uint64_array([]), + selection_vector.to_array) + end +end diff --git a/ci/docker/ubuntu-16.04-cpp.dockerfile b/ci/docker/ubuntu-16.04-cpp.dockerfile deleted file mode 100644 index 5c98ae30e1e..00000000000 --- a/ci/docker/ubuntu-16.04-cpp.dockerfile +++ /dev/null @@ -1,100 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG base=amd64/ubuntu:16.04 -FROM ${base} - -SHELL ["/bin/bash", "-o", "pipefail", "-c"] - -ENV DEBIAN_FRONTEND noninteractive - -# LLVM 10 or later requires C++ 14 but g++-5's C++ 14 support is limited. -# cpp/src/arrow/vendored/datetime/date.h doesn't work. 
-# ARG llvm -ENV llvm=8 -RUN apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - apt-transport-https \ - software-properties-common \ - wget && \ - wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ - apt-add-repository -y "deb https://apt.llvm.org/xenial/ llvm-toolchain-xenial-${llvm} main" && \ - apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - autoconf \ - ca-certificates \ - ccache \ - clang-${llvm} \ - cmake \ - g++ \ - gcc \ - gdb \ - git \ - libboost-all-dev \ - libbrotli-dev \ - libbz2-dev \ - libgoogle-glog-dev \ - liblz4-dev \ - libre2-dev \ - libssl-dev \ - libutf8proc-dev \ - libzstd1-dev \ - llvm-${llvm}-dev \ - make \ - ninja-build \ - pkg-config \ - protobuf-compiler \ - python3 \ - tzdata && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# Benchmark is deactivated as the external project requires CMake 3.6+ -# Gandiva JNI is deactivated as it requires CMake 3.11+ -# - c-ares in Xenial isn't recognized by gRPC build system -# - libprotobuf-dev / libprotoc-dev in Xenial too old for gRPC -# - libboost-all-dev does not include Boost.Process, needed for Flight -# unit tests, so doing vendored build by default -ENV ARROW_BUILD_BENCHMARKS=OFF \ - ARROW_BUILD_TESTS=ON \ - ARROW_DATASET=ON \ - ARROW_DEPENDENCY_SOURCE=SYSTEM \ - ARROW_GANDIVA_JAVA=OFF \ - ARROW_GANDIVA=ON \ - ARROW_HOME=/usr/local \ - ARROW_PARQUET=ON \ - ARROW_USE_CCACHE=ON \ - ARROW_WITH_BROTLI=ON \ - ARROW_WITH_BZ2=ON \ - ARROW_WITH_LZ4=ON \ - ARROW_WITH_SNAPPY=ON \ - ARROW_WITH_ZLIB=ON \ - ARROW_WITH_ZSTD=ON \ - BOOST_SOURCE=BUNDLED \ - cares_SOURCE=BUNDLED \ - CC=gcc \ - CXX=g++ \ - gRPC_SOURCE=BUNDLED \ - GTest_SOURCE=BUNDLED \ - ORC_SOURCE=BUNDLED \ - PARQUET_BUILD_EXAMPLES=ON \ - PARQUET_BUILD_EXECUTABLES=ON \ - PATH=/usr/lib/ccache/:$PATH \ - Protobuf_SOURCE=BUNDLED \ - RapidJSON_SOURCE=BUNDLED \ - Snappy_SOURCE=BUNDLED \ - Thrift_SOURCE=BUNDLED diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 1d9e41bba7a..c5b55eef42a 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -79,8 +79,10 @@ build() { export CPPFLAGS="${CPPFLAGS} -I${MINGW_PREFIX}/include" export LIBS="-L${MINGW_PREFIX}/libs" export ARROW_S3=OFF + export ARROW_WITH_RE2=OFF else export ARROW_S3=ON + export ARROW_WITH_RE2=ON fi MSYS2_ARG_CONV_EXCL="-DCMAKE_INSTALL_PREFIX=" \ @@ -105,6 +107,7 @@ build() { -DARROW_SNAPPY_USE_SHARED=OFF \ -DARROW_USE_GLOG=OFF \ -DARROW_WITH_LZ4=ON \ + -DARROW_WITH_RE2="${ARROW_WITH_RE2}" \ -DARROW_WITH_SNAPPY=ON \ -DARROW_WITH_ZLIB=ON \ -DARROW_WITH_ZSTD=ON \ diff --git a/ci/scripts/go_build.sh b/ci/scripts/go_build.sh index bb53bd82131..7093be4d238 100755 --- a/ci/scripts/go_build.sh +++ b/ci/scripts/go_build.sh @@ -27,3 +27,10 @@ go get -d -t -v ./... go install -v ./... popd + +pushd ${source_dir}/parquet + +go get -d -t -v ./... +go install -v ./... + +popd diff --git a/ci/scripts/go_test.sh b/ci/scripts/go_test.sh index 077749fc945..7dd873df3e1 100755 --- a/ci/scripts/go_test.sh +++ b/ci/scripts/go_test.sh @@ -28,3 +28,11 @@ for d in $(go list ./... | grep -v vendor); do done popd + +pushd ${source_dir}/parquet + +for d in $(go list ./... 
| grep -v vendor); do + go test $d +done + +popd diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index aa23e5b7c18..5d2e71916ed 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -33,3 +33,4 @@ archery integration --with-all --run-flight \ --gold-dirs=$gold_dir/1.0.0-bigendian \ --gold-dirs=$gold_dir/1.0.0-littleendian \ --gold-dirs=$gold_dir/2.0.0-compression \ + --gold-dirs=$gold_dir/4.0.0-shareddict \ diff --git a/ci/scripts/msys2_setup.sh b/ci/scripts/msys2_setup.sh index 3f451e96b83..cb6ca30a64e 100755 --- a/ci/scripts/msys2_setup.sh +++ b/ci/scripts/msys2_setup.sh @@ -61,6 +61,7 @@ esac pacman \ --needed \ --noconfirm \ + --refresh \ --sync \ "${packages[@]}" diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh new file mode 100755 index 00000000000..7a021f70f74 --- /dev/null +++ b/ci/scripts/python_wheel_macos_build.sh @@ -0,0 +1,133 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -ex + +source_dir=${1} +build_dir=${2} + +echo "=== (${PYTHON_VERSION}) Clear output directories and leftovers ===" +# Clear output directories and leftovers +rm -rf ${build_dir}/install +rm -rf ${source_dir}/python/dist +rm -rf ${source_dir}/python/build +rm -rf ${source_dir}/python/repaired_wheels +rm -rf ${source_dir}/python/pyarrow/*.so +rm -rf ${source_dir}/python/pyarrow/*.so.* + +echo "=== (${PYTHON_VERSION}) Set OSX SDK and C flags ===" +# Arrow is 64-bit-only at the moment +export CFLAGS="-fPIC -arch x86_64 ${CFLAGS//-arch i386/}" +export CXXFLAGS="-fPIC -arch x86_64 ${CXXFLAGS//-arch i386} -std=c++11" +export SDKROOT="$(xcrun --show-sdk-path)" + +echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" +: ${ARROW_DATASET:=ON} +: ${ARROW_FLIGHT:=ON} +: ${ARROW_GANDIVA:=OFF} +: ${ARROW_HDFS:=ON} +: ${ARROW_JEMALLOC:=ON} +: ${ARROW_MIMALLOC:=ON} +: ${ARROW_ORC:=ON} +: ${ARROW_PARQUET:=ON} +: ${ARROW_PLASMA:=ON} +: ${ARROW_S3:=ON} +: ${ARROW_TENSORFLOW:=ON} +: ${ARROW_WITH_BROTLI:=ON} +: ${ARROW_WITH_BZ2:=ON} +: ${ARROW_WITH_LZ4:=ON} +: ${ARROW_WITH_SNAPPY:=ON} +: ${ARROW_WITH_ZLIB:=ON} +: ${ARROW_WITH_ZSTD:=ON} +: ${CMAKE_BUILD_TYPE:=release} +: ${CMAKE_GENERATOR:=Ninja} +: ${VCPKG_FEATURE_FLAGS:=-manifests} +: ${VCPKG_TARGET_TRIPLET:=${VCPKG_DEFAULT_TRIPLET:-x64-osx-static-${CMAKE_BUILD_TYPE}}} + +mkdir -p ${build_dir}/build +pushd ${build_dir}/build +cmake \ + -DARROW_BUILD_SHARED=ON \ + -DARROW_BUILD_STATIC=OFF \ + -DARROW_BUILD_TESTS=OFF \ + -DARROW_DATASET=${ARROW_DATASET} \ + -DARROW_DEPENDENCY_SOURCE="VCPKG" \ + -DARROW_DEPENDENCY_USE_SHARED=OFF \ + -DARROW_FLIGHT==${ARROW_FLIGHT} \ + -DARROW_GANDIVA=${ARROW_GANDIVA} \ + -DARROW_HDFS=${ARROW_HDFS} \ + -DARROW_JEMALLOC=${ARROW_JEMALLOC} \ + -DARROW_MIMALLOC=${ARROW_MIMALLOC} \ + -DARROW_ORC=${ARROW_ORC} \ + -DARROW_PACKAGE_KIND="manylinux${MANYLINUX_VERSION}" \ + -DARROW_PARQUET=${ARROW_PARQUET} \ + -DARROW_PLASMA=${ARROW_PLASMA} \ + -DARROW_PYTHON=ON \ + -DARROW_RPATH_ORIGIN=ON \ + -DARROW_S3=${ARROW_S3} \ + -DARROW_TENSORFLOW=${ARROW_TENSORFLOW} \ + -DARROW_USE_CCACHE=ON \ + -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI} \ + -DARROW_WITH_BZ2=${ARROW_WITH_BZ2} \ + -DARROW_WITH_LZ4=${ARROW_WITH_LZ4} \ + -DARROW_WITH_SNAPPY=${ARROW_WITH_SNAPPY} \ + -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB} \ + -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD} \ + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DCMAKE_INSTALL_PREFIX=${build_dir}/install \ + -DCMAKE_UNITY_BUILD=ON \ + -DOPENSSL_USE_STATIC_LIBS=ON \ + -DVCPKG_MANIFEST_MODE=OFF \ + -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \ + -G ${CMAKE_GENERATOR} \ + ${source_dir}/cpp +cmake --build . 
--target install +popd + +# Check that we don't expose any unwanted symbols +# check_arrow_visibility + +echo "=== (${PYTHON_VERSION}) Building wheel ===" +export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE} +export PYARROW_BUNDLE_ARROW_CPP=1 +export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR} +export PYARROW_INSTALL_TESTS=1 +export PYARROW_WITH_DATASET=${ARROW_DATASET} +export PYARROW_WITH_FLIGHT=${ARROW_FLIGHT} +export PYARROW_WITH_GANDIVA=${ARROW_GANDIVA} +export PYARROW_WITH_HDFS=${ARROW_HDFS} +export PYARROW_WITH_ORC=${ARROW_ORC} +export PYARROW_WITH_PARQUET=${ARROW_PARQUET} +export PYARROW_WITH_PLASMA=${ARROW_PLASMA} +export PYARROW_WITH_S3=${ARROW_S3} +# PyArrow build configuration +export PKG_CONFIG_PATH=/usr/lib/pkgconfig:${build_dir}/install/lib/pkgconfig + +pushd ${source_dir}/python +python setup.py bdist_wheel +popd + +echo "=== (${PYTHON_VERSION}) Show dynamic libraries the wheel depend on ===" +deps=$(delocate-listdeps ${source_dir}/python/dist/*.whl) + +if echo $deps | grep -v "^@rpath/lib\(arrow\|gandiva\|parquet\|plasma\)"; then + echo "There are non-bundled shared library dependencies." + exit 1 +fi diff --git a/ci/scripts/python_wheel_macos_test.sh b/ci/scripts/python_wheel_macos_test.sh new file mode 100755 index 00000000000..6ac8576d484 --- /dev/null +++ b/ci/scripts/python_wheel_macos_test.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -ex + +source_dir=${1} + +: ${ARROW_S3:=ON} + +export PYARROW_TEST_CYTHON=OFF +export PYARROW_TEST_DATASET=ON +export PYARROW_TEST_GANDIVA=OFF +export PYARROW_TEST_HDFS=ON +export PYARROW_TEST_ORC=ON +export PYARROW_TEST_PANDAS=ON +export PYARROW_TEST_PARQUET=ON +export PYARROW_TEST_PLASMA=ON +export PYARROW_TEST_S3=${ARROW_S3} +export PYARROW_TEST_TENSORFLOW=ON +export PYARROW_TEST_FLIGHT=ON + +export ARROW_TEST_DATA=${source_dir}/testing/data +export PARQUET_TEST_DATA=${source_dir}/submodules/parquet-testing/data + +# Install the built wheels +pip install ${source_dir}/python/dist/*.whl + +# Test that the modules are importable +python -c " +import pyarrow +import pyarrow._hdfs +import pyarrow.csv +import pyarrow.dataset +import pyarrow.flight +import pyarrow.fs +import pyarrow.json +import pyarrow.orc +import pyarrow.parquet +import pyarrow.plasma +" + +if [ "${PYARROW_TEST_S3}" == "ON" ]; then + python -c "import pyarrow._s3fs" +fi + +# Install testing dependencies +pip install -r ${source_dir}/python/requirements-wheel-test.txt + +# Execute unittest +pytest -r s --pyargs pyarrow diff --git a/ci/scripts/r_sanitize.sh b/ci/scripts/r_sanitize.sh index eacee5f17f3..89963eb2dd8 100755 --- a/ci/scripts/r_sanitize.sh +++ b/ci/scripts/r_sanitize.sh @@ -27,7 +27,7 @@ pushd ${source_dir}/tests export TEST_R_WITH_ARROW=TRUE export UBSAN_OPTIONS="print_stacktrace=1,suppressions=/arrow/r/tools/ubsan.supp" -${R_BIN} < testthat.R > testthat.out 2>&1 +${R_BIN} < testthat.R > testthat.out 2>&1 || { cat testthat.out; exit 1; } cat testthat.out if grep -q "runtime error" testthat.out; then diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh index 151e71b8a7c..d447bdf23dd 100755 --- a/ci/scripts/r_test.sh +++ b/ci/scripts/r_test.sh @@ -62,7 +62,7 @@ BEFORE=$(ls -alh ~/) SCRIPT="as_cran <- !identical(tolower(Sys.getenv('NOT_CRAN')), 'true') if (as_cran) { - rcmdcheck::rcmdcheck(args = c('--as-cran', '--run-donttest'), error_on = 'warning', check_dir = 'check') + rcmdcheck::rcmdcheck(args = c('--as-cran', '--run-donttest'), error_on = 'warning', check_dir = 'check', timeout = 3600) } else { if (nzchar(Sys.which('minio'))) { message('Running minio for S3 tests (if build supports them)') @@ -71,7 +71,7 @@ SCRIPT="as_cran <- !identical(tolower(Sys.getenv('NOT_CRAN')), 'true') pid <- sys::exec_background('minio', c('server', minio_dir)) on.exit(tools::pskill(pid)) } - rcmdcheck::rcmdcheck(build_args = '--no-build-vignettes', args = c('--no-manual', '--ignore-vignettes', '--run-donttest'), error_on = 'warning', check_dir = 'check') + rcmdcheck::rcmdcheck(build_args = '--no-build-vignettes', args = c('--no-manual', '--ignore-vignettes', '--run-donttest'), error_on = 'warning', check_dir = 'check', timeout = 3600) }" echo "$SCRIPT" | ${R_BIN} --no-save diff --git a/ci/vcpkg/arm64-linux-static-debug.cmake b/ci/vcpkg/arm64-linux-static-debug.cmake index 5d77b8df7fa..6fea43694cd 100644 --- a/ci/vcpkg/arm64-linux-static-debug.cmake +++ b/ci/vcpkg/arm64-linux-static-debug.cmake @@ -22,5 +22,7 @@ set(VCPKG_CMAKE_SYSTEM_NAME Linux) set(VCPKG_BUILD_TYPE debug) if(NOT CMAKE_HOST_SYSTEM_PROCESSOR) - execute_process(COMMAND "uname" "-m" OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_PROCESSOR OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND "uname" "-m" + OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_PROCESSOR + OUTPUT_STRIP_TRAILING_WHITESPACE) endif() diff --git a/ci/vcpkg/arm64-linux-static-release.cmake b/ci/vcpkg/arm64-linux-static-release.cmake index ebe5bc3fa04..4012848b849 100644 --- 
a/ci/vcpkg/arm64-linux-static-release.cmake +++ b/ci/vcpkg/arm64-linux-static-release.cmake @@ -22,5 +22,7 @@ set(VCPKG_CMAKE_SYSTEM_NAME Linux) set(VCPKG_BUILD_TYPE release) if(NOT CMAKE_HOST_SYSTEM_PROCESSOR) - execute_process(COMMAND "uname" "-m" OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_PROCESSOR OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND "uname" "-m" + OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_PROCESSOR + OUTPUT_STRIP_TRAILING_WHITESPACE) endif() diff --git a/c_glib/autogen.sh b/ci/vcpkg/x64-osx-static-debug.cmake old mode 100755 new mode 100644 similarity index 79% rename from c_glib/autogen.sh rename to ci/vcpkg/x64-osx-static-debug.cmake index eeca380bea8..e8a321ec71a --- a/c_glib/autogen.sh +++ b/ci/vcpkg/x64-osx-static-debug.cmake @@ -1,5 +1,3 @@ -#!/bin/sh -# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,10 +15,11 @@ # specific language governing permissions and limitations # under the License. -set -u -set -e +set(VCPKG_TARGET_ARCHITECTURE x64) +set(VCPKG_CRT_LINKAGE dynamic) +set(VCPKG_LIBRARY_LINKAGE static) -mkdir -p m4 +set(VCPKG_CMAKE_SYSTEM_NAME Darwin) +set(VCPKG_OSX_ARCHITECTURES x86_64) -gtkdocize --copy -autoreconf --install --force +set(VCPKG_BUILD_TYPE debug) diff --git a/c_glib/arrow-glib/arrow-orc-glib.pc.in b/ci/vcpkg/x64-osx-static-release.cmake similarity index 79% rename from c_glib/arrow-glib/arrow-orc-glib.pc.in rename to ci/vcpkg/x64-osx-static-release.cmake index 8e45d402549..956d5b92e73 100644 --- a/c_glib/arrow-glib/arrow-orc-glib.pc.in +++ b/ci/vcpkg/x64-osx-static-release.cmake @@ -15,12 +15,11 @@ # specific language governing permissions and limitations # under the License. 
-prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ +set(VCPKG_TARGET_ARCHITECTURE x64) +set(VCPKG_CRT_LINKAGE dynamic) +set(VCPKG_LIBRARY_LINKAGE static) -Name: Apache Arrow ORC GLib -Description: ORC modules for Apache Arrow GLib -Version: @VERSION@ -Requires: arrow-glib +set(VCPKG_CMAKE_SYSTEM_NAME Darwin) +set(VCPKG_OSX_ARCHITECTURES x86_64) + +set(VCPKG_BUILD_TYPE release) diff --git a/cpp/cmake_modules/FindBoostAlt.cmake b/cpp/cmake_modules/FindBoostAlt.cmake index 123c6dda1c7..1771937125e 100644 --- a/cpp/cmake_modules/FindBoostAlt.cmake +++ b/cpp/cmake_modules/FindBoostAlt.cmake @@ -38,16 +38,14 @@ if(ARROW_BOOST_USE_SHARED) set(BUILD_SHARED_LIBS_KEEP ${BUILD_SHARED_LIBS}) set(BUILD_SHARED_LIBS ON) - find_package(Boost ${BoostAlt_FIND_VERSION_OPTIONS} - COMPONENTS system filesystem) + find_package(Boost ${BoostAlt_FIND_VERSION_OPTIONS} COMPONENTS system filesystem) set(BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS_KEEP}) unset(BUILD_SHARED_LIBS_KEEP) else() # Find static boost headers and libs # TODO Differentiate here between release and debug builds set(Boost_USE_STATIC_LIBS ON) - find_package(Boost ${BoostAlt_FIND_VERSION_OPTIONS} - COMPONENTS system filesystem) + find_package(Boost ${BoostAlt_FIND_VERSION_OPTIONS} COMPONENTS system filesystem) endif() if(Boost_FOUND) diff --git a/cpp/cmake_modules/FindORC.cmake b/cpp/cmake_modules/FindORC.cmake index 1be149c93b2..061a0df2e9e 100644 --- a/cpp/cmake_modules/FindORC.cmake +++ b/cpp/cmake_modules/FindORC.cmake @@ -44,10 +44,9 @@ if(ORC_STATIC_LIB AND ORC_INCLUDE_DIR) add_library(orc::liborc STATIC IMPORTED) set_target_properties(orc::liborc PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES - "${ORC_INCLUDE_DIR}") + INTERFACE_INCLUDE_DIRECTORIES "${ORC_INCLUDE_DIR}") else() - if (ORC_FIND_REQUIRED) + if(ORC_FIND_REQUIRED) message(FATAL_ERROR "ORC library was required in toolchain and unable to locate") endif() set(ORC_FOUND FALSE) diff --git a/cpp/cmake_modules/FindSnappy.cmake b/cpp/cmake_modules/FindSnappy.cmake index 5784cf59220..26cccb786c5 100644 --- a/cpp/cmake_modules/FindSnappy.cmake +++ b/cpp/cmake_modules/FindSnappy.cmake @@ -26,9 +26,13 @@ if(ARROW_SNAPPY_USE_SHARED) else() set(SNAPPY_STATIC_LIB_NAME_BASE "snappy") if(MSVC) - set(SNAPPY_STATIC_LIB_NAME_BASE "${SNAPPY_STATIC_LIB_NAME_BASE}${SNAPPY_MSVC_STATIC_LIB_SUFFIX}") + set(SNAPPY_STATIC_LIB_NAME_BASE + "${SNAPPY_STATIC_LIB_NAME_BASE}${SNAPPY_MSVC_STATIC_LIB_SUFFIX}") endif() - set(SNAPPY_LIB_NAMES "${CMAKE_STATIC_LIBRARY_PREFIX}${SNAPPY_STATIC_LIB_NAME_BASE}${CMAKE_STATIC_LIBRARY_SUFFIX}") + set( + SNAPPY_LIB_NAMES + "${CMAKE_STATIC_LIBRARY_PREFIX}${SNAPPY_STATIC_LIB_NAME_BASE}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) endif() if(Snappy_ROOT) @@ -44,7 +48,9 @@ if(Snappy_ROOT) PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) else() find_library(Snappy_LIB NAMES ${SNAPPY_LIB_NAMES}) - find_path(Snappy_INCLUDE_DIR NAMES snappy.h PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) + find_path(Snappy_INCLUDE_DIR + NAMES snappy.h + PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) endif() find_package_handle_standard_args(Snappy REQUIRED_VARS Snappy_LIB Snappy_INCLUDE_DIR) diff --git a/cpp/cmake_modules/Findutf8proc.cmake b/cpp/cmake_modules/Findutf8proc.cmake index 560321df5db..edea73b8dae 100644 --- a/cpp/cmake_modules/Findutf8proc.cmake +++ b/cpp/cmake_modules/Findutf8proc.cmake @@ -29,37 +29,40 @@ else() endif() set(utf8proc_STATIC_LIB_SUFFIX "${utf8proc_MSVC_STATIC_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}") - 
set(utf8proc_LIB_NAMES "${CMAKE_STATIC_LIBRARY_PREFIX}utf8proc${utf8proc_STATIC_LIB_SUFFIX}") + set(utf8proc_LIB_NAMES + "${CMAKE_STATIC_LIBRARY_PREFIX}utf8proc${utf8proc_STATIC_LIB_SUFFIX}") endif() if(utf8proc_ROOT) - find_library( - utf8proc_LIB - NAMES ${utf8proc_LIB_NAMES} - PATHS ${utf8proc_ROOT} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) + find_library(utf8proc_LIB + NAMES ${utf8proc_LIB_NAMES} + PATHS ${utf8proc_ROOT} + PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} + NO_DEFAULT_PATH) find_path(utf8proc_INCLUDE_DIR NAMES utf8proc.h PATHS ${utf8proc_ROOT} NO_DEFAULT_PATH PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) else() - find_library( - utf8proc_LIB - NAMES ${utf8proc_LIB_NAMES} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) - find_path(utf8proc_INCLUDE_DIR NAMES utf8proc.h PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) + find_library(utf8proc_LIB + NAMES ${utf8proc_LIB_NAMES} + PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) + find_path(utf8proc_INCLUDE_DIR + NAMES utf8proc.h + PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) endif() -find_package_handle_standard_args(utf8proc REQUIRED_VARS utf8proc_LIB utf8proc_INCLUDE_DIR) +find_package_handle_standard_args(utf8proc REQUIRED_VARS utf8proc_LIB + utf8proc_INCLUDE_DIR) if(utf8proc_FOUND) set(utf8proc_FOUND TRUE) add_library(utf8proc::utf8proc UNKNOWN IMPORTED) - set_target_properties(utf8proc::utf8proc - PROPERTIES IMPORTED_LOCATION "${utf8proc_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${utf8proc_INCLUDE_DIR}") + set_target_properties( + utf8proc::utf8proc + PROPERTIES IMPORTED_LOCATION "${utf8proc_LIB}" INTERFACE_INCLUDE_DIRECTORIES + "${utf8proc_INCLUDE_DIR}") if(NOT ARROW_UTF8PROC_USE_SHARED) set_target_properties(utf8proc::utf8proc PROPERTIES INTERFACE_COMPILER_DEFINITIONS "UTF8PROC_STATIC") diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index b534552c3c0..9f68c560472 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -451,7 +451,9 @@ if(ARROW_CPU_FLAG STREQUAL "armv8") endif() set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_ARMV8_ARCH_FLAG}") - add_definitions(-DARROW_HAVE_NEON) + if(NOT ARROW_SIMD_LEVEL STREQUAL "NONE") + add_definitions(-DARROW_HAVE_NEON) + endif() if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.4") diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index bfa3ee15657..05cc642417a 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -405,6 +405,16 @@ else() ) endif() +if(DEFINED ENV{ARROW_BZIP2_URL}) + set(ARROW_BZIP2_SOURCE_URL "$ENV{ARROW_BZIP2_URL}") +else() + set_urls( + ARROW_BZIP2_SOURCE_URL + "https://sourceware.org/pub/bzip2/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" + ) +endif() + if(DEFINED ENV{ARROW_CARES_URL}) set(CARES_SOURCE_URL "$ENV{ARROW_CARES_URL}") else() @@ -571,6 +581,15 @@ else() ) endif() +if(DEFINED ENV{ARROW_UTF8PROC_URL}) + set(ARROW_UTF8PROC_SOURCE_URL "$ENV{ARROW_UTF8PROC_URL}") +else() + set_urls( + ARROW_UTF8PROC_SOURCE_URL + "https://github.com/JuliaStrings/utf8proc/archive/${ARROW_UTF8PROC_BUILD_VERSION}.tar.gz" + ) +endif() + if(DEFINED ENV{ARROW_XSIMD_URL}) set(XSIMD_SOURCE_URL "$ENV{ARROW_XSIMD_URL}") else() @@ -598,30 +617,15 @@ else() ) endif() -if(DEFINED ENV{ARROW_BZIP2_SOURCE_URL}) - 
set(ARROW_BZIP2_SOURCE_URL "$ENV{ARROW_BZIP2_SOURCE_URL}") -else() - set_urls( - ARROW_BZIP2_SOURCE_URL - "https://sourceware.org/pub/bzip2/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" - ) -endif() - -if(DEFINED ENV{ARROW_UTF8PROC_SOURCE_URL}) - set(ARROW_UTF8PROC_SOURCE_URL "$ENV{ARROW_UTF8PROC_SOURCE_URL}") -else() - set_urls( - ARROW_UTF8PROC_SOURCE_URL - "https://github.com/JuliaStrings/utf8proc/archive/${ARROW_UTF8PROC_BUILD_VERSION}.tar.gz" - ) -endif() - # ---------------------------------------------------------------------- # ExternalProject options -set(EP_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}}") -set(EP_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}}") +set( + EP_CXX_FLAGS + "${CMAKE_CXX_COMPILER_ARG1} ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}}" + ) +set(EP_C_FLAGS + "${CMAKE_C_COMPILER_ARG1} ${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}}") if(NOT MSVC_TOOLCHAIN) # Set -fPIC on all external projects @@ -1935,7 +1939,7 @@ macro(build_xsimd) endmacro() # For now xsimd is always bundled from upstream -if(1) +if(NOT ARROW_SIMD_LEVEL STREQUAL "NONE") set(xsimd_SOURCE "BUNDLED") resolve_dependency(xsimd) # TODO: Don't use global includes but rather target_include_directories diff --git a/cpp/cmake_modules/Usevcpkg.cmake b/cpp/cmake_modules/Usevcpkg.cmake index 118d850909f..781bec436f3 100644 --- a/cpp/cmake_modules/Usevcpkg.cmake +++ b/cpp/cmake_modules/Usevcpkg.cmake @@ -57,7 +57,7 @@ else() endif() elseif(DEFINED ENV{VCPKG_ROOT}) # Get it from the environment variable VCPKG_ROOT - set(VCPKG_ROOT ENV{VCPKG_ROOT}) + set(VCPKG_ROOT $ENV{VCPKG_ROOT}) find_program(_VCPKG_BIN vcpkg PATHS "${VCPKG_ROOT}" NO_DEFAULT_PATH) if(NOT _VCPKG_BIN) message( diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 04756aaf8e9..df72dcc5b6b 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -373,6 +373,7 @@ if(ARROW_COMPUTE) compute/kernels/aggregate_tdigest.cc compute/kernels/aggregate_var_std.cc compute/kernels/codegen_internal.cc + compute/kernels/hash_aggregate.cc compute/kernels/scalar_arithmetic.cc compute/kernels/scalar_boolean.cc compute/kernels/scalar_cast_boolean.cc diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index d3ae93318ba..db3c640b9a4 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -117,13 +117,13 @@ class BaseBinaryArray : public FlatArray { } } - IteratorType begin() { return IteratorType(*this); } + IteratorType begin() const { return IteratorType(*this); } - IteratorType end() { return IteratorType(*this, length()); } + IteratorType end() const { return IteratorType(*this, length()); } protected: // For subclasses - BaseBinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {} + BaseBinaryArray() = default; // Protected method for constructors void SetData(const std::shared_ptr& data) { @@ -132,8 +132,8 @@ class BaseBinaryArray : public FlatArray { raw_data_ = data->GetValuesSafe(2, /*offset=*/0); } - const offset_type* raw_value_offsets_; - const uint8_t* raw_data_; + const offset_type* raw_value_offsets_ = NULLPTR; + const uint8_t* raw_data_ = NULLPTR; }; /// Concrete Array class for variable-size binary data @@ -231,9 +231,9 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { const uint8_t* raw_values() const { return raw_values_ + 
data_->offset * byte_width_; } - IteratorType begin() { return IteratorType(*this); } + IteratorType begin() const { return IteratorType(*this); } - IteratorType end() { return IteratorType(*this, length()); } + IteratorType end() const { return IteratorType(*this, length()); } protected: void SetData(const std::shared_ptr& data) { diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index 1696653850b..a50cbcc13cf 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -20,6 +20,7 @@ #include #include +#include #include #include "arrow/array.h" @@ -197,10 +198,11 @@ class TestListArray : public TestBuilder { } void TestFromArrays() { - std::shared_ptr offsets1, offsets2, offsets3, offsets4, values; + std::shared_ptr offsets1, offsets2, offsets3, offsets4, offsets5, values; std::vector offsets_is_valid3 = {true, false, true, true}; std::vector offsets_is_valid4 = {true, true, false, true}; + std::vector offsets_is_valid5 = {true, true, false, false}; std::vector values_is_valid = {true, false, true, true, true, true}; @@ -217,6 +219,8 @@ class TestListArray : public TestBuilder { &offsets3); ArrayFromVector(offsets_is_valid4, offset2_values, &offsets4); + ArrayFromVector(offsets_is_valid5, offset2_values, + &offsets5); ArrayFromVector(values_is_valid, values_values, &values); @@ -254,6 +258,28 @@ class TestListArray : public TestBuilder { // Offsets not the right type ASSERT_RAISES(TypeError, ArrayType::FromArrays(*values, *offsets1, pool_)); + + // Null final offset + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("Last list offset should be non-null"), + ArrayType::FromArrays(*offsets5, *values, pool_)); + + // ARROW-12077: check for off-by-one in construction (need mimalloc/ASan/Valgrind) + { + std::shared_ptr offsets, values; + // Length multiple of 8 - we'll allocate a validity buffer with exactly enough bits + // (Need a large enough buffer or else ASan doesn't catch it) + std::vector offsets_is_valid(4096); + std::vector offset_values(4096); + std::vector values_values(4096); + std::fill(offsets_is_valid.begin(), offsets_is_valid.end(), true); + offsets_is_valid[1] = false; + std::fill(offset_values.begin(), offset_values.end(), 0); + std::fill(values_values.begin(), values_values.end(), 0); + ArrayFromVector(offsets_is_valid, offset_values, &offsets); + ArrayFromVector(values_values, &values); + ASSERT_OK_AND_ASSIGN(auto list, ArrayType::FromArrays(*offsets, *values, pool_)); + } } void TestAppendNull() { diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 97bbb18696c..f967127c5f1 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -70,12 +70,11 @@ Status CleanListOffsets(const Array& offsets, MemoryPool* pool, ARROW_ASSIGN_OR_RAISE(auto clean_offsets, AllocateBuffer(num_offsets * sizeof(offset_type), pool)); - // Copy valid bits, zero out the bit for the final offset - // XXX why? 
+ // Copy valid bits, ignoring the final offset (since for a length N list array, + // we have N + 1 offsets) ARROW_ASSIGN_OR_RAISE( auto clean_valid_bits, offsets.null_bitmap()->CopySlice(0, BitUtil::BytesForBits(num_offsets - 1))); - BitUtil::ClearBit(clean_valid_bits->mutable_data(), num_offsets); *validity_buf_out = clean_valid_bits; const offset_type* raw_offsets = typed_offsets.raw_values(); diff --git a/cpp/src/arrow/array/array_primitive.h b/cpp/src/arrow/array/array_primitive.h index f9ac60f6cb9..b601eb770c3 100644 --- a/cpp/src/arrow/array/array_primitive.h +++ b/cpp/src/arrow/array/array_primitive.h @@ -64,9 +64,9 @@ class NumericArray : public PrimitiveArray { // For API compatibility with BinaryArray etc. value_type GetView(int64_t i) const { return Value(i); } - IteratorType begin() { return IteratorType(*this); } + IteratorType begin() const { return IteratorType(*this); } - IteratorType end() { return IteratorType(*this, length()); } + IteratorType end() const { return IteratorType(*this, length()); } protected: using PrimitiveArray::PrimitiveArray; @@ -99,9 +99,9 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { /// values. Result is not cached. int64_t true_count() const; - IteratorType begin() { return IteratorType(*this); } + IteratorType begin() const { return IteratorType(*this); } - IteratorType end() { return IteratorType(*this, length()); } + IteratorType end() const { return IteratorType(*this, length()); } protected: using PrimitiveArray::PrimitiveArray; diff --git a/cpp/src/arrow/array/array_union_test.cc b/cpp/src/arrow/array/array_union_test.cc index 1eb722b13c5..88d25e823bb 100644 --- a/cpp/src/arrow/array/array_union_test.cc +++ b/cpp/src/arrow/array/array_union_test.cc @@ -152,7 +152,8 @@ class TestUnionArrayFactories : public ::testing::Test { TEST_F(TestUnionArrayFactories, TestMakeDense) { std::shared_ptr value_offsets; - ArrayFromVector({1, 0, 0, 0, 1, 0, 1, 2, 1, 2}, &value_offsets); + // type_ids_: {0, 1, 2, 0, 1, 3, 2, 0, 2, 1} + ArrayFromVector({0, 0, 0, 1, 1, 0, 1, 2, 1, 2}, &value_offsets); auto children = std::vector>(4); ArrayFromVector({"abc", "def", "xyz"}, &children[0]); @@ -208,12 +209,19 @@ TEST_F(TestUnionArrayFactories, TestMakeDense) { ASSERT_RAISES(Invalid, result->ValidateFull()); // Invalid offsets + // - offset out of bounds at index 5 std::shared_ptr invalid_offsets; - ArrayFromVector({1, 0, 0, 0, 1, 1, 1, 2, 1, 2}, &invalid_offsets); + ArrayFromVector({0, 0, 0, 1, 1, 1, 1, 2, 1, 2}, &invalid_offsets); ASSERT_OK_AND_ASSIGN(result, DenseUnionArray::Make(*type_ids_, *invalid_offsets, children)); ASSERT_RAISES(Invalid, result->ValidateFull()); - ArrayFromVector({1, 0, 0, 0, 1, -1, 1, 2, 1, 2}, &invalid_offsets); + // - negative offset at index 5 + ArrayFromVector({0, 0, 0, 1, 1, -1, 1, 2, 1, 2}, &invalid_offsets); + ASSERT_OK_AND_ASSIGN(result, + DenseUnionArray::Make(*type_ids_, *invalid_offsets, children)); + ASSERT_RAISES(Invalid, result->ValidateFull()); + // - non-monotonic offset at index 3 + ArrayFromVector({1, 0, 0, 0, 1, 0, 1, 2, 1, 2}, &invalid_offsets); ASSERT_OK_AND_ASSIGN(result, DenseUnionArray::Make(*type_ids_, *invalid_offsets, children)); ASSERT_RAISES(Invalid, result->ValidateFull()); diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 2b8665fe2e6..6ac885f8443 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -527,6 +527,7 @@ struct ValidateArrayFullImpl { } // Check offsets are in bounds + std::vector last_child_offsets(256, 0); const 
int32_t* offsets = data.GetValues(2); for (int64_t i = 0; i < data.length; ++i) { const int32_t code = type_codes[i]; @@ -541,6 +542,11 @@ struct ValidateArrayFullImpl { "than child length (", offset, " >= ", child_lengths[code], ")"); } + if (offset < last_child_offsets[code]) { + return Status::Invalid("Union value at position ", i, + " has non-monotonic offset ", offset); + } + last_child_offsets[code] = offset; } } diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index 41a47c91729..f525ec23c58 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -162,6 +162,12 @@ class ARROW_EXPORT BufferBuilder { return Status::OK(); } + Result> Finish(bool shrink_to_fit = true) { + std::shared_ptr out; + ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit)); + return out; + } + void Reset() { buffer_ = NULLPTR; capacity_ = size_ = 0; @@ -202,6 +208,11 @@ class TypedBufferBuilder< MemoryPool* pool = default_memory_pool()) : bytes_builder_(std::move(buffer), pool) {} + explicit TypedBufferBuilder(BufferBuilder builder) + : bytes_builder_(std::move(builder)) {} + + BufferBuilder* bytes_builder() { return &bytes_builder_; } + Status Append(T value) { return bytes_builder_.Append(reinterpret_cast(&value), sizeof(T)); } @@ -256,6 +267,12 @@ class TypedBufferBuilder< return bytes_builder_.Finish(out, shrink_to_fit); } + Result> Finish(bool shrink_to_fit = true) { + std::shared_ptr out; + ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit)); + return out; + } + void Reset() { bytes_builder_.Reset(); } int64_t length() const { return bytes_builder_.length() / sizeof(T); } @@ -274,6 +291,11 @@ class TypedBufferBuilder { explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool()) : bytes_builder_(pool) {} + explicit TypedBufferBuilder(BufferBuilder builder) + : bytes_builder_(std::move(builder)) {} + + BufferBuilder* bytes_builder() { return &bytes_builder_; } + Status Append(bool value) { ARROW_RETURN_NOT_OK(Reserve(1)); UnsafeAppend(value); @@ -371,6 +393,12 @@ class TypedBufferBuilder { return bytes_builder_.Finish(out, shrink_to_fit); } + Result> Finish(bool shrink_to_fit = true) { + std::shared_ptr out; + ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit)); + return out; + } + void Reset() { bytes_builder_.Reset(); bit_length_ = false_count_ = 0; diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index 387105de9e7..6769b23867b 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -71,7 +71,7 @@ class EqualOptions { return res; } - static EqualOptions Defaults() { return EqualOptions(); } + static EqualOptions Defaults() { return {}; } protected: double atol_ = kDefaultAbsoluteTolerance; diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index eef1587bb73..ca118ec5678 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -306,5 +306,102 @@ Result TDigest(const Datum& value, const TDigestOptions& options = TDigestOptions::Defaults(), ExecContext* ctx = NULLPTR); +namespace internal { + +/// Internal use only: streaming group identifier. +/// Consumes batches of keys and yields batches of the group ids. +class ARROW_EXPORT Grouper { + public: + virtual ~Grouper() = default; + + /// Construct a Grouper which receives the specified key types + static Result> Make(const std::vector& descrs, + ExecContext* ctx = default_exec_context()); + + /// Consume a batch of keys, producing the corresponding group ids as an integer array. 
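+ /// For example (an editorial sketch; which id is assigned to which key is an
+ /// implementation detail, only its consistency across calls matters):
+ ///   Consume({["a", "b", "a", "c"]}) -> [0, 1, 0, 2]
+ ///   Consume({["c", "b"]})           -> [2, 1]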
+ /// Currently only uint32 indices will be produced, eventually the bit width will only + /// be as wide as necessary. + virtual Result Consume(const ExecBatch& batch) = 0; + + /// Get current unique keys. May be called multiple times. + virtual Result GetUniques() = 0; + + /// Get the current number of groups. + virtual uint32_t num_groups() const = 0; + + /// \brief Assemble lists of indices of identical elements. + /// + /// \param[in] ids An unsigned, all-valid integral array which will be + /// used as grouping criteria. + /// \param[in] num_groups An upper bound for the elements of ids + /// \return A num_groups-long ListArray where the slot at i contains a + /// list of indices where i appears in ids. + /// + /// MakeGroupings([ + /// 2, + /// 2, + /// 5, + /// 5, + /// 2, + /// 3 + /// ], 8) == [ + /// [], + /// [], + /// [0, 1, 4], + /// [5], + /// [], + /// [2, 3], + /// [], + /// [] + /// ] + static Result> MakeGroupings( + const UInt32Array& ids, uint32_t num_groups, + ExecContext* ctx = default_exec_context()); + + /// \brief Produce a ListArray whose slots are selections of `array` which correspond to + /// the provided groupings. + /// + /// For example, + /// ApplyGroupings([ + /// [], + /// [], + /// [0, 1, 4], + /// [5], + /// [], + /// [2, 3], + /// [], + /// [] + /// ], [2, 2, 5, 5, 2, 3]) == [ + /// [], + /// [], + /// [2, 2, 2], + /// [3], + /// [], + /// [5, 5], + /// [], + /// [] + /// ] + static Result> ApplyGroupings( + const ListArray& groupings, const Array& array, + ExecContext* ctx = default_exec_context()); +}; + +/// \brief Configure a grouped aggregation +struct ARROW_EXPORT Aggregate { + /// the name of the aggregation function + std::string function; + + /// options for the aggregation function + const FunctionOptions* options; +}; + +/// Internal use only: helper function for testing HashAggregateKernels. +/// This will be replaced by streaming execution operators. 
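+/// A hypothetical call, sketched for orientation (`sales` and `region` stand
+/// for any equal-length arrays; a null options pointer selects the function's
+/// default options):
+///
+///   ARROW_ASSIGN_OR_RAISE(
+///       Datum grouped,
+///       GroupBy(/*arguments=*/{sales}, /*keys=*/{region},
+///               /*aggregates=*/{{"hash_sum", nullptr}}));
+///
+/// producing one aggregated value per distinct key.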
+ARROW_EXPORT +Result GroupBy(const std::vector& arguments, const std::vector& keys, + const std::vector& aggregates, + ExecContext* ctx = default_exec_context()); + +} // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 0d95092c95b..730836bd118 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -68,6 +68,19 @@ struct ARROW_EXPORT SplitPatternOptions : public SplitOptions { std::string pattern; }; +struct ARROW_EXPORT ReplaceSubstringOptions : public FunctionOptions { + explicit ReplaceSubstringOptions(std::string pattern, std::string replacement, + int64_t max_replacements = -1) + : pattern(pattern), replacement(replacement), max_replacements(max_replacements) {} + + /// Pattern to match, literal, or regular expression depending on which kernel is used + std::string pattern; + /// String to replace the pattern with + std::string replacement; + /// Max number of substrings to replace (-1 means unbounded) + int64_t max_replacements; +}; + /// Options for IsIn and IndexIn functions struct ARROW_EXPORT SetLookupOptions : public FunctionOptions { explicit SetLookupOptions(Datum value_set, bool skip_nulls = false) diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index 6443c96e918..c3187a3995a 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -36,6 +36,7 @@ #include "arrow/compute/registry.h" #include "arrow/compute/util_internal.h" #include "arrow/datum.h" +#include "arrow/record_batch.h" #include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/type.h" @@ -57,6 +58,44 @@ using internal::CpuInfo; namespace compute { +ExecContext* default_exec_context() { + static ExecContext default_ctx; + return &default_ctx; +} + +ExecBatch::ExecBatch(const RecordBatch& batch) + : values(batch.num_columns()), length(batch.num_rows()) { + auto columns = batch.column_data(); + std::move(columns.begin(), columns.end(), values.begin()); +} + +Result ExecBatch::Make(std::vector values) { + if (values.empty()) { + return Status::Invalid("Cannot infer ExecBatch length without at least one value"); + } + + int64_t length = -1; + for (const auto& value : values) { + if (value.is_scalar()) { + if (length == -1) { + length = 1; + } + continue; + } + + if (length == -1) { + length = value.length(); + continue; + } + + if (length != value.length()) { + return Status::Invalid( + "Arrays used to construct an ExecBatch must have equal length"); + } + } + + return ExecBatch(std::move(values), length); +} namespace { Result> AllocateDataBuffer(KernelContext* ctx, int64_t length, @@ -838,6 +877,7 @@ class ScalarAggExecutor : public KernelExecutorImpl { private: Status Consume(const ExecBatch& batch) { + // FIXME(ARROW-11840) don't merge *any* aggregates for every batch auto batch_state = kernel_->init(kernel_ctx_, {kernel_, *input_descrs_, options_}); ARROW_CTX_RETURN_IF_ERROR(kernel_ctx_); @@ -855,6 +895,7 @@ class ScalarAggExecutor : public KernelExecutorImpl { kernel_->merge(kernel_ctx_, std::move(*batch_state), state()); ARROW_CTX_RETURN_IF_ERROR(kernel_ctx_); + return Status::OK(); } diff --git a/cpp/src/arrow/compute/exec.h b/cpp/src/arrow/compute/exec.h index f491489ed8a..7659442d8bf 100644 --- a/cpp/src/arrow/compute/exec.h +++ b/cpp/src/arrow/compute/exec.h @@ -119,6 +119,8 @@ class ARROW_EXPORT ExecContext { bool use_threads_ = true; }; +ARROW_EXPORT ExecContext* default_exec_context(); + // TODO: Consider 
standardizing on uint16 selection vectors and only use them // when we can ensure that each value is 64K length or smaller @@ -164,11 +166,15 @@ class ARROW_EXPORT SelectionVector { /// TODO: Datum uses arrow/util/variant.h which may be a bit heavier-weight /// than is desirable for this class. Microbenchmarks would help determine for /// sure. See ARROW-8928. -struct ExecBatch { +struct ARROW_EXPORT ExecBatch { ExecBatch() = default; ExecBatch(std::vector values, int64_t length) : values(std::move(values)), length(length) {} + explicit ExecBatch(const RecordBatch& batch); + + static Result Make(std::vector values); + /// The values representing positional arguments to be passed to a kernel's /// exec function for processing. std::vector values; diff --git a/cpp/src/arrow/compute/exec_internal.h b/cpp/src/arrow/compute/exec_internal.h index a74e5c8d8fa..55daa243cd3 100644 --- a/cpp/src/arrow/compute/exec_internal.h +++ b/cpp/src/arrow/compute/exec_internal.h @@ -106,6 +106,11 @@ class ARROW_EXPORT KernelExecutor { public: virtual ~KernelExecutor() = default; + /// The Kernel's `init` method must be called and any KernelState set in the + /// KernelContext *before* KernelExecutor::Init is called. This is to facilitate + /// the case where init may be expensive and does not need to be called again for + /// each execution of the kernel, for example the same lookup table can be re-used + /// for all scanned batches in a dataset filter. virtual Status Init(KernelContext*, KernelInitArgs) = 0; /// XXX: Better configurability for listener diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc index 70d7d998e9c..c8fc8b8dec0 100644 --- a/cpp/src/arrow/compute/function.cc +++ b/cpp/src/arrow/compute/function.cc @@ -126,6 +126,11 @@ const Kernel* DispatchExactImpl(const Function* func, checked_cast(func)->kernels(), values); } + if (func->kind() == Function::HASH_AGGREGATE) { + return DispatchExactImpl(checked_cast(func)->kernels(), + values); + } + return nullptr; } @@ -184,8 +189,10 @@ Result Function::Execute(const std::vector& args, executor = detail::KernelExecutor::MakeScalar(); } else if (kind() == Function::VECTOR) { executor = detail::KernelExecutor::MakeVector(); - } else { + } else if (kind() == Function::SCALAR_AGGREGATE) { executor = detail::KernelExecutor::MakeScalarAggregate(); + } else { + return Status::NotImplemented("Direct execution of HASH_AGGREGATE functions"); } RETURN_NOT_OK(executor->Init(&kernel_ctx, {kernel, inputs, options})); @@ -263,6 +270,15 @@ Status ScalarAggregateFunction::AddKernel(ScalarAggregateKernel kernel) { return Status::OK(); } +Status HashAggregateFunction::AddKernel(HashAggregateKernel kernel) { + RETURN_NOT_OK(CheckArity(kernel.signature->in_types())); + if (arity_.is_varargs && !kernel.signature->is_varargs()) { + return Status::Invalid("Function accepts varargs but kernel signature does not"); + } + kernels_.emplace_back(std::move(kernel)); + return Status::OK(); +} + Result MetaFunction::Execute(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const { diff --git a/cpp/src/arrow/compute/function.h b/cpp/src/arrow/compute/function.h index af5d81a30ec..9a3e1c1852f 100644 --- a/cpp/src/arrow/compute/function.h +++ b/cpp/src/arrow/compute/function.h @@ -133,6 +133,10 @@ class ARROW_EXPORT Function { /// A function that computes scalar summary statistics from array input. SCALAR_AGGREGATE, + /// A function that computes grouped summary statistics from array input + /// and an array of group identifiers. 
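+ /// For example, a grouped sum over arguments [1.0, 2.0, 3.0] with group
+ /// identifiers [0, 1, 0] would yield [4.0, 2.0]: one slot per group.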
+ HASH_AGGREGATE, + /// A function that dispatches to other functions and does not contain its /// own kernels. META @@ -307,6 +311,21 @@ class ARROW_EXPORT ScalarAggregateFunction Status AddKernel(ScalarAggregateKernel kernel); }; +class ARROW_EXPORT HashAggregateFunction + : public detail::FunctionImpl { + public: + using KernelType = HashAggregateKernel; + + HashAggregateFunction(std::string name, const Arity& arity, const FunctionDoc* doc, + const FunctionOptions* default_options = NULLPTR) + : detail::FunctionImpl( + std::move(name), Function::HASH_AGGREGATE, arity, doc, default_options) {} + + /// \brief Add a kernel (function implementation). Returns error if the + /// kernel's signature does not match the function's arity. + Status AddKernel(HashAggregateKernel kernel); +}; + /// \brief A function that dispatches to other functions. Must implement /// MetaFunction::ExecuteImpl. /// diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index c8f9cacfb34..b99b41170d2 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -537,7 +537,8 @@ struct Kernel { : signature(std::move(sig)), init(std::move(init)) {} Kernel(std::vector in_types, OutputType out_type, KernelInit init) - : Kernel(KernelSignature::Make(std::move(in_types), out_type), std::move(init)) {} + : Kernel(KernelSignature::Make(std::move(in_types), std::move(out_type)), + std::move(init)) {} /// \brief The "signature" of the kernel containing the InputType input /// argument validators and OutputType output type and shape resolver. @@ -574,7 +575,8 @@ struct ArrayKernel : public Kernel { ArrayKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, KernelInit init = NULLPTR) - : Kernel(std::move(in_types), std::move(out_type), init), exec(std::move(exec)) {} + : Kernel(std::move(in_types), std::move(out_type), std::move(init)), + exec(std::move(exec)) {} /// \brief Perform a single invocation of this kernel. Depending on the /// implementation, it may only write into preallocated memory, while in some @@ -617,7 +619,7 @@ struct VectorKernel : public ArrayKernel { VectorKernel() = default; VectorKernel(std::shared_ptr sig, ArrayKernelExec exec) - : ArrayKernel(std::move(sig), exec) {} + : ArrayKernel(std::move(sig), std::move(exec)) {} VectorKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, KernelInit init = NULLPTR, VectorFinalize finalize = NULLPTR) @@ -680,12 +682,12 @@ using ScalarAggregateFinalize = std::function; /// * finalize: produces the end result of the aggregation using the /// KernelState in the KernelContext. 
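/// Schematically, an executor drives these components in sequence (an
/// editorial sketch, not literal code):
///
///   state = init(ctx, args);                  // once per execution
///   for (batch : input) consume(ctx, batch);  // accumulate into state
///   merge(ctx, std::move(other), state);      // combine parallel states
///   finalize(ctx, &out);                      // emit the result Datum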
struct ScalarAggregateKernel : public Kernel { - ScalarAggregateKernel() {} + ScalarAggregateKernel() = default; ScalarAggregateKernel(std::shared_ptr sig, KernelInit init, ScalarAggregateConsume consume, ScalarAggregateMerge merge, ScalarAggregateFinalize finalize) - : Kernel(std::move(sig), init), + : Kernel(std::move(sig), std::move(init)), consume(std::move(consume)), merge(std::move(merge)), finalize(std::move(finalize)) {} @@ -693,13 +695,59 @@ struct ScalarAggregateKernel : public Kernel { ScalarAggregateKernel(std::vector in_types, OutputType out_type, KernelInit init, ScalarAggregateConsume consume, ScalarAggregateMerge merge, ScalarAggregateFinalize finalize) - : ScalarAggregateKernel(KernelSignature::Make(std::move(in_types), out_type), init, - consume, merge, finalize) {} + : ScalarAggregateKernel( + KernelSignature::Make(std::move(in_types), std::move(out_type)), + std::move(init), std::move(consume), std::move(merge), std::move(finalize)) {} ScalarAggregateConsume consume; ScalarAggregateMerge merge; ScalarAggregateFinalize finalize; }; +// ---------------------------------------------------------------------- +// HashAggregateKernel (for HashAggregateFunction) + +using HashAggregateConsume = std::function; + +using HashAggregateMerge = + std::function; + +// Finalize returns Datum to permit multiple return values +using HashAggregateFinalize = std::function; + +/// \brief Kernel data structure for implementations of +/// HashAggregateFunction. The four necessary components of an aggregation +/// kernel are the init, consume, merge, and finalize functions. +/// +/// * init: creates a new KernelState for a kernel. +/// * consume: processes an ExecBatch (which includes the argument as well +/// as an array of group identifiers) and updates the KernelState found in the +/// KernelContext. +/// * merge: combines one KernelState with another. +/// * finalize: produces the end result of the aggregation using the +/// KernelState in the KernelContext. 
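+///
+/// As constructed in hash_aggregate.cc, each consumed ExecBatch is expected to
+/// carry three values: the argument to aggregate, a uint32 array of group ids,
+/// and a uint32 scalar holding the current group count. Illustratively:
+///   batch[0] = [10, 20, 30]   // values
+///   batch[1] = [0, 1, 0]      // group id of each value
+///   batch[2] = 2              // number of groups allocated so far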
+struct HashAggregateKernel : public Kernel { + HashAggregateKernel() = default; + + HashAggregateKernel(std::shared_ptr sig, KernelInit init, + HashAggregateConsume consume, HashAggregateMerge merge, + HashAggregateFinalize finalize) + : Kernel(std::move(sig), std::move(init)), + consume(std::move(consume)), + merge(std::move(merge)), + finalize(std::move(finalize)) {} + + HashAggregateKernel(std::vector in_types, OutputType out_type, + KernelInit init, HashAggregateMerge merge, + HashAggregateConsume consume, HashAggregateFinalize finalize) + : HashAggregateKernel( + KernelSignature::Make(std::move(in_types), std::move(out_type)), + std::move(init), std::move(consume), std::move(merge), std::move(finalize)) {} + + HashAggregateConsume consume; + HashAggregateMerge merge; + HashAggregateFinalize finalize; +}; + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 577b250da87..5e223a1f906 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -59,5 +59,9 @@ add_arrow_benchmark(vector_selection_benchmark PREFIX "arrow-compute") # Aggregates -add_arrow_compute_test(aggregate_test SOURCES aggregate_test.cc test_util.cc) +add_arrow_compute_test(aggregate_test + SOURCES + aggregate_test.cc + hash_aggregate_test.cc + test_util.cc) add_arrow_benchmark(aggregate_benchmark PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 5cdd3bd1dd1..61dc8cb403c 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -250,15 +250,13 @@ const FunctionDoc min_max_doc{"Compute the minimum and maximum values of a numer {"array"}, "MinMaxOptions"}; -const FunctionDoc any_doc{ - "Test whether any element in a boolean array evaluates to true.", - ("Null values are ignored."), - {"array"}}; - -const FunctionDoc all_doc{ - "Test whether all elements in a boolean array evaluate to true.", - ("Null values are ignored."), - {"array"}}; +const FunctionDoc any_doc{"Test whether any element in a boolean array evaluates to true", + ("Null values are ignored."), + {"array"}}; + +const FunctionDoc all_doc{"Test whether all elements in a boolean array evaluate to true", + ("Null values are ignored."), + {"array"}}; } // namespace diff --git a/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc b/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc index c90dd03c06e..42be0c36544 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc @@ -300,6 +300,169 @@ BENCHMARK_TEMPLATE(ReferenceSum, SumBitmapVectorizeUnroll) ->Apply(BenchmarkSetArgs); #endif // ARROW_WITH_BENCHMARKS_REFERENCE +// +// GroupBy +// + +static void BenchmarkGroupBy(benchmark::State& state, + std::vector aggregates, + std::vector arguments, std::vector keys) { + for (auto _ : state) { + ABORT_NOT_OK(GroupBy(arguments, keys, aggregates).status()); + } +} + +#define GROUP_BY_BENCHMARK(Name, Impl) \ + static void Name(benchmark::State& state) { \ + RegressionArgs args(state, false); \ + auto rng = random::RandomArrayGenerator(1923); \ + (Impl)(); \ + } \ + BENCHMARK(Name)->Apply([](benchmark::internal::Benchmark* bench) { \ + BenchmarkSetArgsWithSizes(bench, {1 * 1024 * 1024}); \ + }) + +GROUP_BY_BENCHMARK(SumDoublesGroupedByTinyStringSet, [&] { + auto summand = rng.Float64(args.size, + 
/*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto key = rng.StringWithRepeats(args.size, + /*unique=*/16, + /*min_length=*/3, + /*max_length=*/32); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedBySmallStringSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto key = rng.StringWithRepeats(args.size, + /*unique=*/256, + /*min_length=*/3, + /*max_length=*/32); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedByMediumStringSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto key = rng.StringWithRepeats(args.size, + /*unique=*/4096, + /*min_length=*/3, + /*max_length=*/32); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedByTinyIntegerSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto key = rng.Int64(args.size, + /*min=*/0, + /*max=*/15); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedBySmallIntegerSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto key = rng.Int64(args.size, + /*min=*/0, + /*max=*/255); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedByMediumIntegerSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto key = rng.Int64(args.size, + /*min=*/0, + /*max=*/4095); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedByTinyIntStringPairSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto int_key = rng.Int64(args.size, + /*min=*/0, + /*max=*/4); + auto str_key = rng.StringWithRepeats(args.size, + /*unique=*/4, + /*min_length=*/3, + /*max_length=*/32); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {int_key, str_key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedBySmallIntStringPairSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto int_key = rng.Int64(args.size, + /*min=*/0, + /*max=*/15); + auto str_key = rng.StringWithRepeats(args.size, + /*unique=*/16, + /*min_length=*/3, + /*max_length=*/32); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {int_key, str_key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedByMediumIntStringPairSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto int_key = rng.Int64(args.size, + 
/*min=*/0, + /*max=*/63); + auto str_key = rng.StringWithRepeats(args.size, + /*unique=*/64, + /*min_length=*/3, + /*max_length=*/32); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {int_key, str_key}); +}); + // // Sum // diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 569886a1351..22e7f512e97 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -27,19 +27,26 @@ #include "arrow/array.h" #include "arrow/chunked_array.h" #include "arrow/compute/api_aggregate.h" +#include "arrow/compute/api_scalar.h" +#include "arrow/compute/api_vector.h" +#include "arrow/compute/cast.h" #include "arrow/compute/kernels/aggregate_internal.h" #include "arrow/compute/kernels/test_util.h" +#include "arrow/compute/registry.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bitmap_reader.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/int_util_internal.h" #include "arrow/testing/gtest_common.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" +#include "arrow/util/logging.h" namespace arrow { +using internal::BitmapReader; using internal::checked_cast; using internal::checked_pointer_cast; @@ -65,8 +72,7 @@ static SumResult NaiveSumPartial(const Array& array) { const auto values = array_numeric.raw_values(); if (array.null_count() != 0) { - internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), - array.length()); + BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); for (int64_t i = 0; i < array.length(); i++) { if (reader.IsSet()) { result.first += values[i]; @@ -488,9 +494,7 @@ class TestPrimitiveMinMaxKernel : public ::testing::Test { void AssertMinMaxIsNull(const Datum& array, const MinMaxOptions& options) { ASSERT_OK_AND_ASSIGN(Datum out, MinMax(array, options)); - - const StructScalar& value = out.scalar_as(); - for (const auto& val : value.value) { + for (const auto& val : out.scalar_as().value) { ASSERT_FALSE(val->is_valid); } } @@ -646,8 +650,7 @@ static enable_if_integer> NaiveMinMax( T min = std::numeric_limits::max(); T max = std::numeric_limits::min(); if (array.null_count() != 0) { // Some values are null - internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), - array.length()); + BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); for (int64_t i = 0; i < array.length(); i++) { if (reader.IsSet()) { min = std::min(min, values[i]); @@ -686,8 +689,7 @@ static enable_if_floating_point> NaiveMinMax( T min = std::numeric_limits::infinity(); T max = -std::numeric_limits::infinity(); if (array.null_count() != 0) { // Some values are null - internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), - array.length()); + BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); for (int64_t i = 0; i < array.length(); i++) { if (reader.IsSet()) { min = std::fmin(min, values[i]); @@ -1030,7 +1032,7 @@ ModeResult NaiveMode(const Array& array) { const auto& array_numeric = reinterpret_cast(array); const auto values = array_numeric.raw_values(); - internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); + BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); for (int64_t i = 0; i < array.length(); ++i) { if (reader.IsSet()) { ++value_counts[values[i]]; @@ -1281,7 +1283,7 @@ void KahanSum(double& sum, double& adjust, double 
addend) { template std::pair WelfordVar(const ArrayType& array) { const auto values = array.raw_values(); - internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); + BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); double count = 0, mean = 0, m2 = 0; double mean_adjust = 0, m2_adjust = 0; for (int64_t i = 0; i < array.length(); ++i) { diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 9e2ed82a426..b5d6c3807f1 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -328,6 +328,13 @@ struct UnboxScalar { } }; +template <> +struct UnboxScalar { + static Decimal256 Unbox(const Scalar& val) { + return checked_cast(val).value; + } +}; + template struct BoxScalar; @@ -354,6 +361,13 @@ struct BoxScalar { static void Box(T val, Scalar* out) { checked_cast(out)->value = val; } }; +template <> +struct BoxScalar { + using T = Decimal256; + using ScalarType = Decimal256Scalar; + static void Box(T val, Scalar* out) { checked_cast(out)->value = val; } +}; + // A VisitArrayDataInline variant that calls its visitor function with logical // values, such as Decimal128 rather than util::string_view. @@ -675,12 +689,13 @@ struct ScalarUnaryNotNullStateful { }; template - struct ArrayExec::value>> { + struct ArrayExec> { static void Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0, Datum* out) { ArrayData* out_arr = out->mutable_array(); // Decimal128 data buffers are not safely reinterpret_cast-able on big-endian - using endian_agnostic = std::array; + using endian_agnostic = + std::array::ScalarType::ValueType)>; auto out_data = out_arr->GetMutableValues(1); VisitArrayValuesInline( arg0, diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc new file mode 100644 index 00000000000..d9750cb4760 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -0,0 +1,1057 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
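+
+// Editorial overview of this file: each batch's key columns are encoded
+// row-wise into contiguous byte strings (one KeyEncoder per column, with a
+// leading nullity byte per value); GrouperImpl assigns every distinct encoded
+// row a dense uint32 group id via a hash map; the grouped aggregators below
+// then keep one slot of state per group id and emit one output element per
+// group.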
+ +#include "arrow/compute/api_aggregate.h" + +#include +#include +#include +#include +#include + +#include "arrow/buffer_builder.h" +#include "arrow/compute/api_vector.h" +#include "arrow/compute/exec_internal.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/aggregate_internal.h" +#include "arrow/compute/kernels/common.h" +#include "arrow/util/bit_run_reader.h" +#include "arrow/util/bitmap_ops.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/make_unique.h" +#include "arrow/visitor_inline.h" + +namespace arrow { + +using internal::checked_cast; + +namespace compute { +namespace internal { +namespace { + +struct KeyEncoder { + // the first byte of an encoded key is used to indicate nullity + static constexpr bool kExtraByteForNull = true; + + static constexpr uint8_t kNullByte = 1; + static constexpr uint8_t kValidByte = 0; + + virtual ~KeyEncoder() = default; + + virtual void AddLength(const ArrayData&, int32_t* lengths) = 0; + + virtual Status Encode(const ArrayData&, uint8_t** encoded_bytes) = 0; + + virtual Result> Decode(uint8_t** encoded_bytes, + int32_t length, MemoryPool*) = 0; + + // extract the null bitmap from the leading nullity bytes of encoded keys + static Status DecodeNulls(MemoryPool* pool, int32_t length, uint8_t** encoded_bytes, + std::shared_ptr* null_bitmap, int32_t* null_count) { + // first count nulls to determine if a null bitmap is necessary + *null_count = 0; + for (int32_t i = 0; i < length; ++i) { + *null_count += (encoded_bytes[i][0] == kNullByte); + } + + if (*null_count > 0) { + ARROW_ASSIGN_OR_RAISE(*null_bitmap, AllocateBitmap(length, pool)); + + uint8_t* validity = (*null_bitmap)->mutable_data(); + for (int32_t i = 0; i < length; ++i) { + BitUtil::SetBitTo(validity, i, encoded_bytes[i][0] == kValidByte); + encoded_bytes[i] += 1; + } + } else { + for (int32_t i = 0; i < length; ++i) { + encoded_bytes[i] += 1; + } + } + return Status ::OK(); + } +}; + +struct BooleanKeyEncoder : KeyEncoder { + static constexpr int kByteWidth = 1; + + void AddLength(const ArrayData& data, int32_t* lengths) override { + for (int64_t i = 0; i < data.length; ++i) { + lengths[i] += kByteWidth + kExtraByteForNull; + } + } + + Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override { + VisitArrayDataInline( + data, + [&](bool value) { + auto& encoded_ptr = *encoded_bytes++; + *encoded_ptr++ = kValidByte; + *encoded_ptr++ = value; + }, + [&] { + auto& encoded_ptr = *encoded_bytes++; + *encoded_ptr++ = kNullByte; + *encoded_ptr++ = 0; + }); + return Status::OK(); + } + + Result> Decode(uint8_t** encoded_bytes, int32_t length, + MemoryPool* pool) override { + std::shared_ptr null_buf; + int32_t null_count; + RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count)); + + ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBitmap(length, pool)); + + uint8_t* raw_output = key_buf->mutable_data(); + for (int32_t i = 0; i < length; ++i) { + auto& encoded_ptr = encoded_bytes[i]; + BitUtil::SetBitTo(raw_output, i, encoded_ptr[0] != 0); + encoded_ptr += 1; + } + + return ArrayData::Make(boolean(), length, {std::move(null_buf), std::move(key_buf)}, + null_count); + } +}; + +struct FixedWidthKeyEncoder : KeyEncoder { + explicit FixedWidthKeyEncoder(std::shared_ptr type) + : type_(std::move(type)), + byte_width_(checked_cast(*type_).bit_width() / 8) {} + + void AddLength(const ArrayData& data, int32_t* lengths) override { + for (int64_t i = 0; i < data.length; ++i) { + lengths[i] += byte_width_ + kExtraByteForNull; + } + } + + 
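+ // Per-row encoded layout is [nullity byte][byte_width_ value bytes]; e.g. an
+ // int32 value 7 would encode (illustratively, on a little-endian machine) as
+ // [kValidByte, 0x07, 0x00, 0x00, 0x00] and a null as [kNullByte, 0, 0, 0, 0].
+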
Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override { + ArrayData viewed(fixed_size_binary(byte_width_), data.length, data.buffers, + data.null_count, data.offset); + + VisitArrayDataInline( + viewed, + [&](util::string_view bytes) { + auto& encoded_ptr = *encoded_bytes++; + *encoded_ptr++ = kValidByte; + memcpy(encoded_ptr, bytes.data(), byte_width_); + encoded_ptr += byte_width_; + }, + [&] { + auto& encoded_ptr = *encoded_bytes++; + *encoded_ptr++ = kNullByte; + memset(encoded_ptr, 0, byte_width_); + encoded_ptr += byte_width_; + }); + return Status::OK(); + } + + Result> Decode(uint8_t** encoded_bytes, int32_t length, + MemoryPool* pool) override { + std::shared_ptr null_buf; + int32_t null_count; + RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count)); + + ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBuffer(length * byte_width_, pool)); + + uint8_t* raw_output = key_buf->mutable_data(); + for (int32_t i = 0; i < length; ++i) { + auto& encoded_ptr = encoded_bytes[i]; + std::memcpy(raw_output, encoded_ptr, byte_width_); + encoded_ptr += byte_width_; + raw_output += byte_width_; + } + + return ArrayData::Make(type_, length, {std::move(null_buf), std::move(key_buf)}, + null_count); + } + + std::shared_ptr type_; + int byte_width_; +}; + +struct DictionaryKeyEncoder : FixedWidthKeyEncoder { + DictionaryKeyEncoder(std::shared_ptr type, MemoryPool* pool) + : FixedWidthKeyEncoder(std::move(type)), pool_(pool) {} + + Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override { + auto dict = MakeArray(data.dictionary); + if (dictionary_) { + if (!dictionary_->Equals(dict)) { + // TODO(bkietz) unify if necessary. For now, just error if any batch's dictionary + // differs from the first we saw for this key + return Status::NotImplemented("Unifying differing dictionaries"); + } + } else { + dictionary_ = std::move(dict); + } + return FixedWidthKeyEncoder::Encode(data, encoded_bytes); + } + + Result> Decode(uint8_t** encoded_bytes, int32_t length, + MemoryPool* pool) override { + ARROW_ASSIGN_OR_RAISE(auto data, + FixedWidthKeyEncoder::Decode(encoded_bytes, length, pool)); + + if (dictionary_) { + data->dictionary = dictionary_->data(); + } else { + ARROW_ASSIGN_OR_RAISE(auto dict, MakeArrayOfNull(type_, 0)); + data->dictionary = dict->data(); + } + + data->type = type_; + return data; + } + + MemoryPool* pool_; + std::shared_ptr dictionary_; +}; + +template +struct VarLengthKeyEncoder : KeyEncoder { + using Offset = typename T::offset_type; + + void AddLength(const ArrayData& data, int32_t* lengths) override { + int64_t i = 0; + VisitArrayDataInline( + data, + [&](util::string_view bytes) { + lengths[i++] += + kExtraByteForNull + sizeof(Offset) + static_cast(bytes.size()); + }, + [&] { lengths[i++] += kExtraByteForNull + sizeof(Offset); }); + } + + Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override { + VisitArrayDataInline( + data, + [&](util::string_view bytes) { + auto& encoded_ptr = *encoded_bytes++; + *encoded_ptr++ = kValidByte; + util::SafeStore(encoded_ptr, static_cast(bytes.size())); + encoded_ptr += sizeof(Offset); + memcpy(encoded_ptr, bytes.data(), bytes.size()); + encoded_ptr += bytes.size(); + }, + [&] { + auto& encoded_ptr = *encoded_bytes++; + *encoded_ptr++ = kNullByte; + util::SafeStore(encoded_ptr, static_cast(0)); + encoded_ptr += sizeof(Offset); + }); + return Status::OK(); + } + + Result> Decode(uint8_t** encoded_bytes, int32_t length, + MemoryPool* pool) override { + std::shared_ptr null_buf; + 
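+ // Decoding is two-pass: after the nullity bytes are stripped, a first pass
+ // sums the stored lengths to size one contiguous value buffer, then a second
+ // pass replays the encoded keys to write the offsets and copy the bytes.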
int32_t null_count; + RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count)); + + Offset length_sum = 0; + for (int32_t i = 0; i < length; ++i) { + length_sum += util::SafeLoadAs(encoded_bytes[i]); + } + + ARROW_ASSIGN_OR_RAISE(auto offset_buf, + AllocateBuffer(sizeof(Offset) * (1 + length), pool)); + ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBuffer(length_sum)); + + auto raw_offsets = reinterpret_cast(offset_buf->mutable_data()); + auto raw_keys = key_buf->mutable_data(); + + Offset current_offset = 0; + for (int32_t i = 0; i < length; ++i) { + raw_offsets[i] = current_offset; + + auto key_length = util::SafeLoadAs(encoded_bytes[i]); + encoded_bytes[i] += sizeof(Offset); + + memcpy(raw_keys + current_offset, encoded_bytes[i], key_length); + encoded_bytes[i] += key_length; + + current_offset += key_length; + } + raw_offsets[length] = current_offset; + + return ArrayData::Make( + type_, length, {std::move(null_buf), std::move(offset_buf), std::move(key_buf)}, + null_count); + } + + explicit VarLengthKeyEncoder(std::shared_ptr type) : type_(std::move(type)) {} + + std::shared_ptr type_; +}; + +struct GrouperImpl : Grouper { + static Result> Make(const std::vector& keys, + ExecContext* ctx) { + auto impl = ::arrow::internal::make_unique(); + + impl->encoders_.resize(keys.size()); + impl->ctx_ = ctx; + + for (size_t i = 0; i < keys.size(); ++i) { + const auto& key = keys[i].type; + + if (key->id() == Type::BOOL) { + impl->encoders_[i] = ::arrow::internal::make_unique(); + continue; + } + + if (key->id() == Type::DICTIONARY) { + impl->encoders_[i] = + ::arrow::internal::make_unique(key, ctx->memory_pool()); + continue; + } + + if (is_fixed_width(key->id())) { + impl->encoders_[i] = ::arrow::internal::make_unique(key); + continue; + } + + if (is_binary_like(key->id())) { + impl->encoders_[i] = + ::arrow::internal::make_unique>(key); + continue; + } + + if (is_large_binary_like(key->id())) { + impl->encoders_[i] = + ::arrow::internal::make_unique>(key); + continue; + } + + return Status::NotImplemented("Keys of type ", *key); + } + + return std::move(impl); + } + + Result Consume(const ExecBatch& batch) override { + std::vector offsets_batch(batch.length + 1); + for (int i = 0; i < batch.num_values(); ++i) { + encoders_[i]->AddLength(*batch[i].array(), offsets_batch.data()); + } + + int32_t total_length = 0; + for (int64_t i = 0; i < batch.length; ++i) { + auto total_length_before = total_length; + total_length += offsets_batch[i]; + offsets_batch[i] = total_length_before; + } + offsets_batch[batch.length] = total_length; + + std::vector key_bytes_batch(total_length); + std::vector key_buf_ptrs(batch.length); + for (int64_t i = 0; i < batch.length; ++i) { + key_buf_ptrs[i] = key_bytes_batch.data() + offsets_batch[i]; + } + + for (int i = 0; i < batch.num_values(); ++i) { + RETURN_NOT_OK(encoders_[i]->Encode(*batch[i].array(), key_buf_ptrs.data())); + } + + TypedBufferBuilder group_ids_batch(ctx_->memory_pool()); + RETURN_NOT_OK(group_ids_batch.Resize(batch.length)); + + for (int64_t i = 0; i < batch.length; ++i) { + int32_t key_length = offsets_batch[i + 1] - offsets_batch[i]; + std::string key( + reinterpret_cast(key_bytes_batch.data() + offsets_batch[i]), + key_length); + + auto it_success = map_.emplace(key, num_groups_); + auto group_id = it_success.first->second; + + if (it_success.second) { + // new key; update offsets and key_bytes + ++num_groups_; + auto next_key_offset = static_cast(key_bytes_.size()); + key_bytes_.resize(next_key_offset + key_length); + 
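+ // offsets_ records where each group's encoded key ends within key_bytes_
+ // (offsets_[0] is always 0), so GetUniques can later re-slice and Decode
+ // every unique key.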
offsets_.push_back(next_key_offset + key_length); + memcpy(key_bytes_.data() + next_key_offset, key.c_str(), key_length); + } + + group_ids_batch.UnsafeAppend(group_id); + } + + ARROW_ASSIGN_OR_RAISE(auto group_ids, group_ids_batch.Finish()); + return Datum(UInt32Array(batch.length, std::move(group_ids))); + } + + uint32_t num_groups() const override { return num_groups_; } + + Result GetUniques() override { + ExecBatch out({}, num_groups_); + + std::vector key_buf_ptrs(num_groups_); + for (int64_t i = 0; i < num_groups_; ++i) { + key_buf_ptrs[i] = key_bytes_.data() + offsets_[i]; + } + + out.values.resize(encoders_.size()); + for (size_t i = 0; i < encoders_.size(); ++i) { + ARROW_ASSIGN_OR_RAISE( + out.values[i], + encoders_[i]->Decode(key_buf_ptrs.data(), static_cast(num_groups_), + ctx_->memory_pool())); + } + + return out; + } + + ExecContext* ctx_; + std::unordered_map map_; + std::vector offsets_ = {0}; + std::vector key_bytes_; + uint32_t num_groups_ = 0; + std::vector> encoders_; +}; + +/// C++ abstract base class for the HashAggregateKernel interface. +/// Implementations should be default constructible and perform initialization in +/// Init(). +struct GroupedAggregator : KernelState { + virtual Status Init(ExecContext*, const FunctionOptions*, + const std::shared_ptr&) = 0; + + virtual Status Consume(const ExecBatch& batch) = 0; + + virtual Result Finalize() = 0; + + template + Status MaybeReserve(int64_t old_num_groups, const ExecBatch& batch, + const Reserve& reserve) { + int64_t new_num_groups = batch[2].scalar_as().value; + if (new_num_groups <= old_num_groups) { + return Status::OK(); + } + return reserve(new_num_groups - old_num_groups); + } + + virtual std::shared_ptr out_type() const = 0; +}; + +// ---------------------------------------------------------------------- +// Count implementation + +struct GroupedCountImpl : public GroupedAggregator { + Status Init(ExecContext* ctx, const FunctionOptions* options, + const std::shared_ptr&) override { + options_ = checked_cast(*options); + counts_ = BufferBuilder(ctx->memory_pool()); + return Status::OK(); + } + + Status Consume(const ExecBatch& batch) override { + RETURN_NOT_OK(MaybeReserve(counts_.length(), batch, [&](int64_t added_groups) { + num_groups_ += added_groups; + return counts_.Append(added_groups * sizeof(int64_t), 0); + })); + + auto group_ids = batch[1].array()->GetValues(1); + auto raw_counts = reinterpret_cast(counts_.mutable_data()); + + const auto& input = batch[0].array(); + + if (options_.count_mode == CountOptions::COUNT_NULL) { + for (int64_t i = 0, input_i = input->offset; i < input->length; ++i, ++input_i) { + auto g = group_ids[i]; + raw_counts[g] += !BitUtil::GetBit(input->buffers[0]->data(), input_i); + } + return Status::OK(); + } + + arrow::internal::VisitSetBitRunsVoid( + input->buffers[0], input->offset, input->length, + [&](int64_t begin, int64_t length) { + for (int64_t input_i = begin, i = begin - input->offset; + input_i < begin + length; ++input_i, ++i) { + auto g = group_ids[i]; + raw_counts[g] += 1; + } + }); + return Status::OK(); + } + + Result Finalize() override { + ARROW_ASSIGN_OR_RAISE(auto counts, counts_.Finish()); + return std::make_shared(num_groups_, std::move(counts)); + } + + std::shared_ptr out_type() const override { return int64(); } + + int64_t num_groups_ = 0; + CountOptions options_; + BufferBuilder counts_; +}; + +// ---------------------------------------------------------------------- +// Sum implementation + +struct GroupedSumImpl : public GroupedAggregator { + 
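+ // Editorial note: sums accumulate in a widened type chosen per input type
+ // (e.g. int32 inputs are summed into int64 slots), one slot per group id.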
// NB: whether we are accumulating into double, int64_t, or uint64_t + // we always have 64 bits per group in the sums buffer. + static constexpr size_t kSumSize = sizeof(int64_t); + + using ConsumeImpl = std::function&, + const uint32_t*, void*, int64_t*)>; + + struct GetConsumeImpl { + template ::Type> + Status Visit(const T&) { + consume_impl = [](const std::shared_ptr& input, const uint32_t* group, + void* boxed_sums, int64_t* counts) { + auto sums = reinterpret_cast::CType*>(boxed_sums); + + VisitArrayDataInline( + *input, + [&](typename TypeTraits::CType value) { + sums[*group] += value; + counts[*group] += 1; + ++group; + }, + [&] { ++group; }); + }; + out_type = TypeTraits::type_singleton(); + return Status::OK(); + } + + Status Visit(const HalfFloatType& type) { + return Status::NotImplemented("Summing data of type ", type); + } + + Status Visit(const DataType& type) { + return Status::NotImplemented("Summing data of type ", type); + } + + ConsumeImpl consume_impl; + std::shared_ptr out_type; + }; + + Status Init(ExecContext* ctx, const FunctionOptions*, + const std::shared_ptr& input_type) override { + pool_ = ctx->memory_pool(); + sums_ = BufferBuilder(pool_); + counts_ = BufferBuilder(pool_); + + GetConsumeImpl get_consume_impl; + RETURN_NOT_OK(VisitTypeInline(*input_type, &get_consume_impl)); + + consume_impl_ = std::move(get_consume_impl.consume_impl); + out_type_ = std::move(get_consume_impl.out_type); + + return Status::OK(); + } + + Status Consume(const ExecBatch& batch) override { + RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) { + num_groups_ += added_groups; + RETURN_NOT_OK(sums_.Append(added_groups * kSumSize, 0)); + RETURN_NOT_OK(counts_.Append(added_groups * sizeof(int64_t), 0)); + return Status::OK(); + })); + + auto group_ids = batch[1].array()->GetValues(1); + consume_impl_(batch[0].array(), group_ids, sums_.mutable_data(), + reinterpret_cast(counts_.mutable_data())); + return Status::OK(); + } + + Result Finalize() override { + std::shared_ptr null_bitmap; + int64_t null_count = 0; + + for (int64_t i = 0; i < num_groups_; ++i) { + if (reinterpret_cast(counts_.data())[i] > 0) continue; + + if (null_bitmap == nullptr) { + ARROW_ASSIGN_OR_RAISE(null_bitmap, AllocateBitmap(num_groups_, pool_)); + BitUtil::SetBitsTo(null_bitmap->mutable_data(), 0, num_groups_, true); + } + + null_count += 1; + BitUtil::SetBitTo(null_bitmap->mutable_data(), i, false); + } + + ARROW_ASSIGN_OR_RAISE(auto sums, sums_.Finish()); + + return ArrayData::Make(std::move(out_type_), num_groups_, + {std::move(null_bitmap), std::move(sums)}, null_count); + } + + std::shared_ptr out_type() const override { return out_type_; } + + // NB: counts are used here instead of a simple "has_values_" bitmap since + // we expect to reuse this kernel to handle Mean + int64_t num_groups_ = 0; + BufferBuilder sums_, counts_; + std::shared_ptr out_type_; + ConsumeImpl consume_impl_; + MemoryPool* pool_; +}; + +// ---------------------------------------------------------------------- +// MinMax implementation + +template +struct Extrema : std::numeric_limits {}; + +template <> +struct Extrema { + static constexpr float min() { return -std::numeric_limits::infinity(); } + static constexpr float max() { return std::numeric_limits::infinity(); } +}; + +template <> +struct Extrema { + static constexpr double min() { return -std::numeric_limits::infinity(); } + static constexpr double max() { return std::numeric_limits::infinity(); } +}; + +struct GroupedMinMaxImpl : public GroupedAggregator 
{ + using ConsumeImpl = + std::function&, const uint32_t*, void*, void*, + uint8_t*, uint8_t*)>; + + using ResizeImpl = std::function; + + template + static ResizeImpl MakeResizeImpl(CType anti_extreme) { + // resize a min or max buffer, storing the correct anti extreme + return [anti_extreme](BufferBuilder* builder, int64_t added_groups) { + TypedBufferBuilder typed_builder(std::move(*builder)); + RETURN_NOT_OK(typed_builder.Append(added_groups, anti_extreme)); + *builder = std::move(*typed_builder.bytes_builder()); + return Status::OK(); + }; + } + + struct GetImpl { + template ::CType> + enable_if_number Visit(const T&) { + consume_impl = [](const std::shared_ptr& input, const uint32_t* group, + void* mins, void* maxes, uint8_t* has_values, + uint8_t* has_nulls) { + auto raw_mins = reinterpret_cast(mins); + auto raw_maxes = reinterpret_cast(maxes); + + VisitArrayDataInline( + *input, + [&](CType val) { + raw_maxes[*group] = std::max(raw_maxes[*group], val); + raw_mins[*group] = std::min(raw_mins[*group], val); + BitUtil::SetBit(has_values, *group++); + }, + [&] { BitUtil::SetBit(has_nulls, *group++); }); + }; + + resize_min_impl = MakeResizeImpl(Extrema::max()); + resize_max_impl = MakeResizeImpl(Extrema::min()); + return Status::OK(); + } + + Status Visit(const BooleanType& type) { + return Status::NotImplemented("Grouped MinMax data of type ", type); + } + + Status Visit(const HalfFloatType& type) { + return Status::NotImplemented("Grouped MinMax data of type ", type); + } + + Status Visit(const DataType& type) { + return Status::NotImplemented("Grouped MinMax data of type ", type); + } + + ConsumeImpl consume_impl; + ResizeImpl resize_min_impl, resize_max_impl; + }; + + Status Init(ExecContext* ctx, const FunctionOptions* options, + const std::shared_ptr& input_type) override { + options_ = *checked_cast(options); + type_ = input_type; + + mins_ = BufferBuilder(ctx->memory_pool()); + maxes_ = BufferBuilder(ctx->memory_pool()); + has_values_ = BufferBuilder(ctx->memory_pool()); + has_nulls_ = BufferBuilder(ctx->memory_pool()); + + GetImpl get_impl; + RETURN_NOT_OK(VisitTypeInline(*input_type, &get_impl)); + + consume_impl_ = std::move(get_impl.consume_impl); + resize_min_impl_ = std::move(get_impl.resize_min_impl); + resize_max_impl_ = std::move(get_impl.resize_max_impl); + resize_bitmap_impl_ = MakeResizeImpl(false); + + return Status::OK(); + } + + Status Consume(const ExecBatch& batch) override { + RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) { + num_groups_ += added_groups; + RETURN_NOT_OK(resize_min_impl_(&mins_, added_groups)); + RETURN_NOT_OK(resize_max_impl_(&maxes_, added_groups)); + RETURN_NOT_OK(resize_bitmap_impl_(&has_values_, added_groups)); + RETURN_NOT_OK(resize_bitmap_impl_(&has_nulls_, added_groups)); + return Status::OK(); + })); + + auto group_ids = batch[1].array()->GetValues(1); + consume_impl_(batch[0].array(), group_ids, mins_.mutable_data(), + maxes_.mutable_data(), has_values_.mutable_data(), + has_nulls_.mutable_data()); + return Status::OK(); + } + + Result Finalize() override { + // aggregation for group is valid if there was at least one value in that group + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, has_values_.Finish()); + + if (options_.null_handling == MinMaxOptions::EMIT_NULL) { + // ... 
and there were no nulls in that group + ARROW_ASSIGN_OR_RAISE(auto has_nulls, has_nulls_.Finish()); + arrow::internal::BitmapAndNot(null_bitmap->data(), 0, has_nulls->data(), 0, + num_groups_, 0, null_bitmap->mutable_data()); + } + + auto mins = ArrayData::Make(type_, num_groups_, {null_bitmap, nullptr}); + auto maxes = ArrayData::Make(type_, num_groups_, {std::move(null_bitmap), nullptr}); + ARROW_ASSIGN_OR_RAISE(mins->buffers[1], mins_.Finish()); + ARROW_ASSIGN_OR_RAISE(maxes->buffers[1], maxes_.Finish()); + + return ArrayData::Make(out_type(), num_groups_, {nullptr}, + {std::move(mins), std::move(maxes)}); + } + + std::shared_ptr out_type() const override { + return struct_({field("min", type_), field("max", type_)}); + } + + int64_t num_groups_; + BufferBuilder mins_, maxes_, has_values_, has_nulls_; + std::shared_ptr type_; + ConsumeImpl consume_impl_; + ResizeImpl resize_min_impl_, resize_max_impl_, resize_bitmap_impl_; + MinMaxOptions options_; +}; + +template +HashAggregateKernel MakeKernel(InputType argument_type) { + HashAggregateKernel kernel; + + kernel.init = [](KernelContext* ctx, + const KernelInitArgs& args) -> std::unique_ptr { + auto impl = ::arrow::internal::make_unique(); + // FIXME(bkietz) Init should not take a type. That should be an unboxed template arg + // for the Impl. Otherwise we're not exposing dispatch as well as we should. + ctx->SetStatus(impl->Init(ctx->exec_context(), args.options, args.inputs[0].type)); + if (ctx->HasError()) return nullptr; + return std::move(impl); + }; + + kernel.signature = KernelSignature::Make( + {std::move(argument_type), InputType::Array(Type::UINT32), + InputType::Scalar(Type::UINT32)}, + OutputType( + [](KernelContext* ctx, const std::vector&) -> Result { + return checked_cast(ctx->state())->out_type(); + })); + + kernel.consume = [](KernelContext* ctx, const ExecBatch& batch) { + ctx->SetStatus(checked_cast(ctx->state())->Consume(batch)); + }; + + kernel.merge = [](KernelContext* ctx, KernelState&&, KernelState*) { + // TODO(ARROW-11840) merge two hash tables + ctx->SetStatus(Status::NotImplemented("Merge hashed aggregations")); + }; + + kernel.finalize = [](KernelContext* ctx, Datum* out) { + KERNEL_ASSIGN_OR_RAISE(*out, ctx, + checked_cast(ctx->state())->Finalize()); + }; + + return kernel; +} + +Result> GetKernels( + ExecContext* ctx, const std::vector& aggregates, + const std::vector& in_descrs) { + if (aggregates.size() != in_descrs.size()) { + return Status::Invalid(aggregates.size(), " aggregate functions were specified but ", + in_descrs.size(), " arguments were provided."); + } + + std::vector kernels(in_descrs.size()); + + for (size_t i = 0; i < aggregates.size(); ++i) { + ARROW_ASSIGN_OR_RAISE(auto function, + ctx->func_registry()->GetFunction(aggregates[i].function)); + ARROW_ASSIGN_OR_RAISE( + const Kernel* kernel, + function->DispatchExact( + {in_descrs[i], ValueDescr::Array(uint32()), ValueDescr::Scalar(uint32())})); + kernels[i] = static_cast(kernel); + } + return kernels; +} + +Result>> InitKernels( + const std::vector& kernels, ExecContext* ctx, + const std::vector& aggregates, const std::vector& in_descrs) { + std::vector> states(kernels.size()); + + for (size_t i = 0; i < aggregates.size(); ++i) { + auto options = aggregates[i].options; + + if (options == nullptr) { + // use known default options for the named function if possible + auto maybe_function = ctx->func_registry()->GetFunction(aggregates[i].function); + if (maybe_function.ok()) { + options = maybe_function.ValueOrDie()->default_options(); + } + } 
+ + KernelContext kernel_ctx{ctx}; + states[i] = kernels[i]->init(&kernel_ctx, KernelInitArgs{kernels[i], + { + in_descrs[i].type, + uint32(), + uint32(), + }, + options}); + if (kernel_ctx.HasError()) return kernel_ctx.status(); + } + + return std::move(states); +} + +Result ResolveKernels( + const std::vector& aggregates, + const std::vector& kernels, + const std::vector>& states, ExecContext* ctx, + const std::vector& descrs) { + FieldVector fields(descrs.size()); + + for (size_t i = 0; i < kernels.size(); ++i) { + KernelContext kernel_ctx{ctx}; + kernel_ctx.SetState(states[i].get()); + + ARROW_ASSIGN_OR_RAISE(auto descr, kernels[i]->signature->out_type().Resolve( + &kernel_ctx, { + descrs[i].type, + uint32(), + uint32(), + })); + fields[i] = field(aggregates[i].function, std::move(descr.type)); + } + return fields; +} + +} // namespace + +Result> Grouper::Make(const std::vector& descrs, + ExecContext* ctx) { + return GrouperImpl::Make(descrs, ctx); +} + +Result GroupBy(const std::vector& arguments, const std::vector& keys, + const std::vector& aggregates, ExecContext* ctx) { + // Construct and initialize HashAggregateKernels + ARROW_ASSIGN_OR_RAISE(auto argument_descrs, + ExecBatch::Make(arguments).Map( + [](ExecBatch batch) { return batch.GetDescriptors(); })); + + ARROW_ASSIGN_OR_RAISE(auto kernels, GetKernels(ctx, aggregates, argument_descrs)); + + ARROW_ASSIGN_OR_RAISE(auto states, + InitKernels(kernels, ctx, aggregates, argument_descrs)); + + ARROW_ASSIGN_OR_RAISE( + FieldVector out_fields, + ResolveKernels(aggregates, kernels, states, ctx, argument_descrs)); + + using arrow::compute::detail::ExecBatchIterator; + + ARROW_ASSIGN_OR_RAISE(auto argument_batch_iterator, + ExecBatchIterator::Make(arguments, ctx->exec_chunksize())); + + // Construct Grouper + ARROW_ASSIGN_OR_RAISE(auto key_descrs, ExecBatch::Make(keys).Map([](ExecBatch batch) { + return batch.GetDescriptors(); + })); + + ARROW_ASSIGN_OR_RAISE(auto grouper, Grouper::Make(key_descrs, ctx)); + + int i = 0; + for (ValueDescr& key_descr : key_descrs) { + out_fields.push_back(field("key_" + std::to_string(i++), std::move(key_descr.type))); + } + + ARROW_ASSIGN_OR_RAISE(auto key_batch_iterator, + ExecBatchIterator::Make(keys, ctx->exec_chunksize())); + + // start "streaming" execution + ExecBatch key_batch, argument_batch; + while (argument_batch_iterator->Next(&argument_batch) && + key_batch_iterator->Next(&key_batch)) { + if (key_batch.length == 0) continue; + + // compute a batch of group ids + ARROW_ASSIGN_OR_RAISE(Datum id_batch, grouper->Consume(key_batch)); + + // consume group ids with HashAggregateKernels + for (size_t i = 0; i < kernels.size(); ++i) { + KernelContext batch_ctx{ctx}; + batch_ctx.SetState(states[i].get()); + ARROW_ASSIGN_OR_RAISE(auto batch, ExecBatch::Make({argument_batch[i], id_batch, + Datum(grouper->num_groups())})); + kernels[i]->consume(&batch_ctx, batch); + if (batch_ctx.HasError()) return batch_ctx.status(); + } + } + + // Finalize output + ArrayDataVector out_data(arguments.size() + keys.size()); + auto it = out_data.begin(); + + for (size_t i = 0; i < kernels.size(); ++i) { + KernelContext batch_ctx{ctx}; + batch_ctx.SetState(states[i].get()); + Datum out; + kernels[i]->finalize(&batch_ctx, &out); + if (batch_ctx.HasError()) return batch_ctx.status(); + *it++ = out.array(); + } + + ARROW_ASSIGN_OR_RAISE(ExecBatch out_keys, grouper->GetUniques()); + for (const auto& key : out_keys.values) { + *it++ = key.array(); + } + + int64_t length = out_data[0]->length; + return 
ArrayData::Make(struct_(std::move(out_fields)), length, + {/*null_bitmap=*/nullptr}, std::move(out_data), + /*null_count=*/0); +} + +Result> Grouper::ApplyGroupings(const ListArray& groupings, + const Array& array, + ExecContext* ctx) { + ARROW_ASSIGN_OR_RAISE(Datum sorted, + compute::Take(array, groupings.data()->child_data[0], + TakeOptions::NoBoundsCheck(), ctx)); + + return std::make_shared(list(array.type()), groupings.length(), + groupings.value_offsets(), sorted.make_array()); +} + +Result> Grouper::MakeGroupings(const UInt32Array& ids, + uint32_t num_groups, + ExecContext* ctx) { + if (ids.null_count() != 0) { + return Status::Invalid("MakeGroupings with null ids"); + } + + ARROW_ASSIGN_OR_RAISE(auto offsets, AllocateBuffer(sizeof(int32_t) * (num_groups + 1), + ctx->memory_pool())); + auto raw_offsets = reinterpret_cast(offsets->mutable_data()); + + std::memset(raw_offsets, 0, offsets->size()); + for (int i = 0; i < ids.length(); ++i) { + DCHECK_LT(ids.Value(i), num_groups); + raw_offsets[ids.Value(i)] += 1; + } + int32_t length = 0; + for (uint32_t id = 0; id < num_groups; ++id) { + auto offset = raw_offsets[id]; + raw_offsets[id] = length; + length += offset; + } + raw_offsets[num_groups] = length; + DCHECK_EQ(ids.length(), length); + + ARROW_ASSIGN_OR_RAISE(auto offsets_copy, + offsets->CopySlice(0, offsets->size(), ctx->memory_pool())); + raw_offsets = reinterpret_cast(offsets_copy->mutable_data()); + + ARROW_ASSIGN_OR_RAISE(auto sort_indices, AllocateBuffer(sizeof(int32_t) * ids.length(), + ctx->memory_pool())); + auto raw_sort_indices = reinterpret_cast(sort_indices->mutable_data()); + for (int i = 0; i < ids.length(); ++i) { + raw_sort_indices[raw_offsets[ids.Value(i)]++] = i; + } + + return std::make_shared( + list(int32()), num_groups, std::move(offsets), + std::make_shared(ids.length(), std::move(sort_indices))); +} + +namespace { +const FunctionDoc hash_count_doc{"Count the number of null / non-null values", + ("By default, non-null values are counted.\n" + "This can be changed through CountOptions."), + {"array", "group_id_array", "group_count"}, + "CountOptions"}; + +const FunctionDoc hash_sum_doc{"Sum values of a numeric array", + ("Null values are ignored."), + {"array", "group_id_array", "group_count"}}; + +const FunctionDoc hash_min_max_doc{ + "Compute the minimum and maximum values of a numeric array", + ("Null values are ignored by default.\n" + "This can be changed through MinMaxOptions."), + {"array", "group_id_array", "group_count"}, + "MinMaxOptions"}; +} // namespace + +void RegisterHashAggregateBasic(FunctionRegistry* registry) { + { + static auto default_count_options = CountOptions::Defaults(); + auto func = std::make_shared( + "hash_count", Arity::Ternary(), &hash_count_doc, &default_count_options); + DCHECK_OK(func->AddKernel(MakeKernel(ValueDescr::ARRAY))); + DCHECK_OK(registry->AddFunction(std::move(func))); + } + + { + auto func = std::make_shared("hash_sum", Arity::Ternary(), + &hash_sum_doc); + DCHECK_OK(func->AddKernel(MakeKernel(ValueDescr::ARRAY))); + DCHECK_OK(registry->AddFunction(std::move(func))); + } + + { + static auto default_minmax_options = MinMaxOptions::Defaults(); + auto func = std::make_shared( + "hash_min_max", Arity::Ternary(), &hash_min_max_doc, &default_minmax_options); + DCHECK_OK(func->AddKernel(MakeKernel(ValueDescr::ARRAY))); + DCHECK_OK(registry->AddFunction(std::move(func))); + } +} + +} // namespace internal +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc 
b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc new file mode 100644 index 00000000000..7858d8bb147 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc @@ -0,0 +1,703 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/array.h" +#include "arrow/chunked_array.h" +#include "arrow/compute/api_aggregate.h" +#include "arrow/compute/api_scalar.h" +#include "arrow/compute/api_vector.h" +#include "arrow/compute/cast.h" +#include "arrow/compute/kernels/aggregate_internal.h" +#include "arrow/compute/kernels/codegen_internal.h" +#include "arrow/compute/kernels/test_util.h" +#include "arrow/compute/registry.h" +#include "arrow/testing/generator.h" +#include "arrow/testing/gtest_common.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bitmap_reader.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/int_util_internal.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/logging.h" + +using testing::HasSubstr; + +namespace arrow { + +using internal::BitmapReader; +using internal::checked_cast; +using internal::checked_pointer_cast; + +namespace compute { +namespace { + +Result NaiveGroupBy(std::vector arguments, std::vector keys, + const std::vector& aggregates) { + ARROW_ASSIGN_OR_RAISE(auto key_batch, ExecBatch::Make(std::move(keys))); + + ARROW_ASSIGN_OR_RAISE(auto grouper, + internal::Grouper::Make(key_batch.GetDescriptors())); + + ARROW_ASSIGN_OR_RAISE(Datum id_batch, grouper->Consume(key_batch)); + + ARROW_ASSIGN_OR_RAISE( + auto groupings, internal::Grouper::MakeGroupings(*id_batch.array_as(), + grouper->num_groups())); + + ArrayVector out_columns; + std::vector out_names; + + for (size_t i = 0; i < arguments.size(); ++i) { + out_names.push_back(aggregates[i].function); + + // trim "hash_" prefix + auto scalar_agg_function = aggregates[i].function.substr(5); + + ARROW_ASSIGN_OR_RAISE( + auto grouped_argument, + internal::Grouper::ApplyGroupings(*groupings, *arguments[i].make_array())); + + ScalarVector aggregated_scalars; + + for (int64_t i_group = 0; i_group < grouper->num_groups(); ++i_group) { + auto slice = grouped_argument->value_slice(i_group); + if (slice->length() == 0) continue; + ARROW_ASSIGN_OR_RAISE( + Datum d, CallFunction(scalar_agg_function, {slice}, aggregates[i].options)); + aggregated_scalars.push_back(d.scalar()); + } + + ARROW_ASSIGN_OR_RAISE(Datum aggregated_column, + ScalarVectorToArray(aggregated_scalars)); + out_columns.push_back(aggregated_column.make_array()); + } + + int i = 0; + ARROW_ASSIGN_OR_RAISE(auto uniques, grouper->GetUniques()); + for (const Datum& key : 
uniques.values) { + out_columns.push_back(key.make_array()); + out_names.push_back("key_" + std::to_string(i++)); + } + + return StructArray::Make(std::move(out_columns), std::move(out_names)); +} + +void ValidateGroupBy(const std::vector& aggregates, + std::vector arguments, std::vector keys) { + ASSERT_OK_AND_ASSIGN(Datum expected, NaiveGroupBy(arguments, keys, aggregates)); + + ASSERT_OK_AND_ASSIGN(Datum actual, GroupBy(arguments, keys, aggregates)); + + ASSERT_OK(expected.make_array()->ValidateFull()); + ASSERT_OK(actual.make_array()->ValidateFull()); + + AssertDatumsEqual(expected, actual, /*verbose=*/true); +} + +} // namespace + +TEST(Grouper, SupportedKeys) { + ASSERT_OK(internal::Grouper::Make({boolean()})); + + ASSERT_OK(internal::Grouper::Make({int8(), uint16(), int32(), uint64()})); + + ASSERT_OK(internal::Grouper::Make({dictionary(int64(), utf8())})); + + ASSERT_OK(internal::Grouper::Make({float16(), float32(), float64()})); + + ASSERT_OK(internal::Grouper::Make({utf8(), binary(), large_utf8(), large_binary()})); + + ASSERT_OK(internal::Grouper::Make({fixed_size_binary(16), fixed_size_binary(32)})); + + ASSERT_OK(internal::Grouper::Make({decimal128(32, 10), decimal256(76, 20)})); + + ASSERT_OK(internal::Grouper::Make({date32(), date64()})); + + for (auto unit : { + TimeUnit::SECOND, + TimeUnit::MILLI, + TimeUnit::MICRO, + TimeUnit::NANO, + }) { + ASSERT_OK(internal::Grouper::Make({timestamp(unit), duration(unit)})); + } + + ASSERT_OK(internal::Grouper::Make({day_time_interval(), month_interval()})); + + ASSERT_RAISES(NotImplemented, internal::Grouper::Make({struct_({field("", int64())})})); + + ASSERT_RAISES(NotImplemented, internal::Grouper::Make({struct_({})})); + + ASSERT_RAISES(NotImplemented, internal::Grouper::Make({list(int32())})); + + ASSERT_RAISES(NotImplemented, internal::Grouper::Make({fixed_size_list(int32(), 5)})); + + ASSERT_RAISES(NotImplemented, + internal::Grouper::Make({dense_union({field("", int32())})})); +} + +struct TestGrouper { + explicit TestGrouper(std::vector descrs) : descrs_(std::move(descrs)) { + grouper_ = internal::Grouper::Make(descrs_).ValueOrDie(); + + FieldVector fields; + for (const auto& descr : descrs_) { + fields.push_back(field("", descr.type)); + } + key_schema_ = schema(std::move(fields)); + } + + void ExpectConsume(const std::string& key_json, const std::string& expected) { + ExpectConsume(ExecBatch(*RecordBatchFromJSON(key_schema_, key_json)), + ArrayFromJSON(uint32(), expected)); + } + + void ExpectConsume(const std::vector& key_batch, Datum expected) { + ExpectConsume(*ExecBatch::Make(key_batch), expected); + } + + void ExpectConsume(const ExecBatch& key_batch, Datum expected) { + Datum ids; + ConsumeAndValidate(key_batch, &ids); + AssertDatumsEqual(expected, ids, /*verbose=*/true); + } + + void ConsumeAndValidate(const ExecBatch& key_batch, Datum* ids = nullptr) { + ASSERT_OK_AND_ASSIGN(Datum id_batch, grouper_->Consume(key_batch)); + + ValidateConsume(key_batch, id_batch); + + if (ids) { + *ids = std::move(id_batch); + } + } + + void ValidateConsume(const ExecBatch& key_batch, const Datum& id_batch) { + if (uniques_.length == -1) { + ASSERT_OK_AND_ASSIGN(uniques_, grouper_->GetUniques()); + } else if (static_cast(grouper_->num_groups()) > uniques_.length) { + ASSERT_OK_AND_ASSIGN(ExecBatch new_uniques, grouper_->GetUniques()); + + // check that uniques_ are prefixes of new_uniques + for (int i = 0; i < uniques_.num_values(); ++i) { + auto new_unique = new_uniques[i].make_array(); + ASSERT_OK(new_unique->ValidateFull()); + + 
AssertDatumsEqual(uniques_[i], new_unique->Slice(0, uniques_.length), + /*verbose=*/true); + } + + uniques_ = std::move(new_uniques); + } + + // check that the ids encode an equivalent key sequence + auto ids = id_batch.make_array(); + ASSERT_OK(ids->ValidateFull()); + + for (int i = 0; i < key_batch.num_values(); ++i) { + SCOPED_TRACE(std::to_string(i) + "th key array"); + auto original = key_batch[i].make_array(); + ASSERT_OK_AND_ASSIGN(auto encoded, Take(*uniques_[i].make_array(), *ids)); + AssertArraysEqual(*original, *encoded, /*verbose=*/true, + EqualOptions().nans_equal(true)); + } + } + + std::vector descrs_; + std::shared_ptr key_schema_; + std::unique_ptr grouper_; + ExecBatch uniques_ = ExecBatch({}, -1); +}; + +TEST(Grouper, BooleanKey) { + TestGrouper g({boolean()}); + + g.ExpectConsume("[[true], [true]]", "[0, 0]"); + + g.ExpectConsume("[[true], [true]]", "[0, 0]"); + + g.ExpectConsume("[[false], [null]]", "[1, 2]"); + + g.ExpectConsume("[[true], [false], [true], [false], [null], [false], [null]]", + "[0, 1, 0, 1, 2, 1, 2]"); +} + +TEST(Grouper, NumericKey) { + for (auto ty : { + uint8(), + int8(), + uint16(), + int16(), + uint32(), + int32(), + uint64(), + int64(), + float16(), + float32(), + float64(), + }) { + SCOPED_TRACE("key type: " + ty->ToString()); + + TestGrouper g({ty}); + + g.ExpectConsume("[[3], [3]]", "[0, 0]"); + + g.ExpectConsume("[[3], [3]]", "[0, 0]"); + + g.ExpectConsume("[[27], [81]]", "[1, 2]"); + + g.ExpectConsume("[[3], [27], [3], [27], [null], [81], [27], [81]]", + "[0, 1, 0, 1, 3, 2, 1, 2]"); + } +} + +TEST(Grouper, FloatingPointKey) { + TestGrouper g({float32()}); + + // -0.0 hashes differently from 0.0 + g.ExpectConsume("[[0.0], [-0.0]]", "[0, 1]"); + + g.ExpectConsume("[[Inf], [-Inf]]", "[2, 3]"); + + // assert(!(NaN == NaN)) does not cause spurious new groups + g.ExpectConsume("[[NaN], [NaN]]", "[4, 4]"); + + // TODO(bkietz) test denormal numbers, more NaNs +} + +TEST(Grouper, StringKey) { + for (auto ty : {utf8(), large_utf8(), fixed_size_binary(2)}) { + SCOPED_TRACE("key type: " + ty->ToString()); + + TestGrouper g({ty}); + + g.ExpectConsume(R"([["eh"], ["eh"]])", "[0, 0]"); + + g.ExpectConsume(R"([["eh"], ["eh"]])", "[0, 0]"); + + g.ExpectConsume(R"([["be"], [null]])", "[1, 2]"); + } +} + +TEST(Grouper, DictKey) { + TestGrouper g({dictionary(int32(), utf8())}); + + // For dictionary keys, all batches must share a single dictionary. + // Eventually, differing dictionaries will be unified and indices transposed + // during encoding to relieve this restriction. 
+ const auto dict = ArrayFromJSON(utf8(), R"(["ex", "why", "zee", null])"); + + auto WithIndices = [&](const std::string& indices) { + return Datum(*DictionaryArray::FromArrays(ArrayFromJSON(int32(), indices), dict)); + }; + + // NB: null index is not considered equivalent to index=3 (which encodes null in dict) + g.ExpectConsume({WithIndices(" [3, 1, null, 0, 2]")}, + ArrayFromJSON(uint32(), "[0, 1, 2, 3, 4]")); + + g = TestGrouper({dictionary(int32(), utf8())}); + + g.ExpectConsume({WithIndices(" [0, 1, 2, 3, null]")}, + ArrayFromJSON(uint32(), "[0, 1, 2, 3, 4]")); + + g.ExpectConsume({WithIndices(" [3, 1, null, 0, 2]")}, + ArrayFromJSON(uint32(), "[3, 1, 4, 0, 2]")); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + NotImplemented, HasSubstr("Unifying differing dictionaries"), + g.grouper_->Consume(*ExecBatch::Make({*DictionaryArray::FromArrays( + ArrayFromJSON(int32(), "[0, 1]"), + ArrayFromJSON(utf8(), R"(["different", "dictionary"])"))}))); +} + +TEST(Grouper, StringInt64Key) { + TestGrouper g({utf8(), int64()}); + + g.ExpectConsume(R"([["eh", 0], ["eh", 0]])", "[0, 0]"); + + g.ExpectConsume(R"([["eh", 0], ["eh", null]])", "[0, 1]"); + + g.ExpectConsume(R"([["eh", 1], ["bee", 1]])", "[2, 3]"); + + g.ExpectConsume(R"([["eh", null], ["bee", 1]])", "[1, 3]"); + + g = TestGrouper({utf8(), int64()}); + + g.ExpectConsume(R"([ + ["ex", 0], + ["ex", 0], + ["why", 0], + ["ex", 1], + ["why", 0], + ["ex", 1], + ["ex", 0], + ["why", 1] + ])", + "[0, 0, 1, 2, 1, 2, 0, 3]"); + + g.ExpectConsume(R"([ + ["ex", 0], + [null, 0], + [null, 0], + ["ex", 1], + [null, null], + ["ex", 1], + ["ex", 0], + ["why", null] + ])", + "[0, 4, 4, 2, 5, 2, 0, 6]"); +} + +TEST(Grouper, DoubleStringInt64Key) { + TestGrouper g({float64(), utf8(), int64()}); + + g.ExpectConsume(R"([[1.5, "eh", 0], [1.5, "eh", 0]])", "[0, 0]"); + + g.ExpectConsume(R"([[1.5, "eh", 0], [1.5, "eh", 0]])", "[0, 0]"); + + g.ExpectConsume(R"([[1.0, "eh", 0], [1.0, "be", null]])", "[1, 2]"); + + // note: -0 and +0 hash differently + g.ExpectConsume(R"([[-0.0, "be", 7], [0.0, "be", 7]])", "[3, 4]"); +} + +TEST(Grouper, RandomInt64Keys) { + TestGrouper g({int64()}); + for (int i = 0; i < 4; ++i) { + SCOPED_TRACE(std::to_string(i) + "th key batch"); + + ExecBatch key_batch{ + *random::GenerateBatch(g.key_schema_->fields(), 1 << 12, 0xDEADBEEF)}; + g.ConsumeAndValidate(key_batch); + } +} + +TEST(Grouper, RandomStringInt64Keys) { + TestGrouper g({utf8(), int64()}); + for (int i = 0; i < 4; ++i) { + SCOPED_TRACE(std::to_string(i) + "th key batch"); + + ExecBatch key_batch{ + *random::GenerateBatch(g.key_schema_->fields(), 1 << 12, 0xDEADBEEF)}; + g.ConsumeAndValidate(key_batch); + } +} + +TEST(Grouper, RandomStringInt64DoubleInt32Keys) { + TestGrouper g({utf8(), int64(), float64(), int32()}); + for (int i = 0; i < 4; ++i) { + SCOPED_TRACE(std::to_string(i) + "th key batch"); + + ExecBatch key_batch{ + *random::GenerateBatch(g.key_schema_->fields(), 1 << 12, 0xDEADBEEF)}; + g.ConsumeAndValidate(key_batch); + } +} + +TEST(Grouper, MakeGroupings) { + auto ExpectGroupings = [](std::string ids_json, std::string expected_json) { + auto ids = checked_pointer_cast(ArrayFromJSON(uint32(), ids_json)); + auto expected = ArrayFromJSON(list(int32()), expected_json); + + auto num_groups = static_cast(expected->length()); + ASSERT_OK_AND_ASSIGN(auto actual, internal::Grouper::MakeGroupings(*ids, num_groups)); + AssertArraysEqual(*expected, *actual, /*verbose=*/true); + + // validate ApplyGroupings + ASSERT_OK_AND_ASSIGN(auto grouped_ids, + 
internal::Grouper::ApplyGroupings(*actual, *ids)); + + for (uint32_t group = 0; group < num_groups; ++group) { + auto ids_slice = checked_pointer_cast(grouped_ids->value_slice(group)); + for (auto slot : *ids_slice) { + EXPECT_EQ(slot, group); + } + } + }; + + ExpectGroupings("[]", "[[]]"); + + ExpectGroupings("[0, 0, 0]", "[[0, 1, 2]]"); + + ExpectGroupings("[0, 0, 0, 1, 1, 2]", "[[0, 1, 2], [3, 4], [5], []]"); + + ExpectGroupings("[2, 1, 2, 1, 1, 2]", "[[], [1, 3, 4], [0, 2, 5], [], []]"); + + ExpectGroupings("[2, 2, 5, 5, 2, 3]", "[[], [], [0, 1, 4], [5], [], [2, 3], [], []]"); + + auto ids = checked_pointer_cast(ArrayFromJSON(uint32(), "[0, null, 1]")); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("MakeGroupings with null ids"), + internal::Grouper::MakeGroupings(*ids, 5)); +} + +TEST(GroupBy, Errors) { + auto batch = RecordBatchFromJSON( + schema({field("argument", float64()), field("group_id", uint32())}), R"([ + [1.0, 1], + [null, 1], + [0.0, 2], + [null, 3], + [4.0, 0], + [3.25, 1], + [0.125, 2], + [-0.25, 2], + [0.75, 0], + [null, 3] + ])"); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + NotImplemented, HasSubstr("Direct execution of HASH_AGGREGATE functions"), + CallFunction("hash_sum", {batch->GetColumnByName("argument"), + batch->GetColumnByName("group_id"), Datum(uint32_t(4))})); +} + +TEST(GroupBy, SumOnly) { + auto batch = RecordBatchFromJSON( + schema({field("argument", float64()), field("key", int64())}), R"([ + [1.0, 1], + [null, 1], + [0.0, 2], + [null, 3], + [4.0, null], + [3.25, 1], + [0.125, 2], + [-0.25, 2], + [0.75, null], + [null, 3] + ])"); + + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + internal::GroupBy({batch->GetColumnByName("argument")}, + {batch->GetColumnByName("key")}, + { + {"hash_sum", nullptr}, + })); + + AssertDatumsEqual(ArrayFromJSON(struct_({ + field("hash_sum", float64()), + field("key_0", int64()), + }), + R"([ + [4.25, 1], + [-0.125, 2], + [null, 3], + [4.75, null] + ])"), + aggregated_and_grouped, + /*verbose=*/true); +} + +TEST(GroupBy, MinMaxOnly) { + auto batch = RecordBatchFromJSON( + schema({field("argument", float64()), field("key", int64())}), R"([ + [1.0, 1], + [null, 1], + [0.0, 2], + [null, 3], + [4.0, null], + [3.25, 1], + [0.125, 2], + [-0.25, 2], + [0.75, null], + [null, 3] + ])"); + + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + internal::GroupBy({batch->GetColumnByName("argument")}, + {batch->GetColumnByName("key")}, + { + {"hash_min_max", nullptr}, + })); + + AssertDatumsEqual(ArrayFromJSON(struct_({ + field("hash_min_max", struct_({ + field("min", float64()), + field("max", float64()), + })), + field("key_0", int64()), + }), + R"([ + [{"min": 1.0, "max": 3.25}, 1], + [{"min": -0.25, "max": 0.125}, 2], + [{"min": null, "max": null}, 3], + [{"min": 0.75, "max": 4.0}, null] + ])"), + aggregated_and_grouped, + /*verbose=*/true); +} + +TEST(GroupBy, CountAndSum) { + auto batch = RecordBatchFromJSON( + schema({field("argument", float64()), field("key", int64())}), R"([ + [1.0, 1], + [null, 1], + [0.0, 2], + [null, 3], + [4.0, null], + [3.25, 1], + [0.125, 2], + [-0.25, 2], + [0.75, null], + [null, 3] + ])"); + + CountOptions count_options; + ASSERT_OK_AND_ASSIGN( + Datum aggregated_and_grouped, + internal::GroupBy( + { + // NB: passing an argument twice or also using it as a key is legal + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("key"), + }, + { + batch->GetColumnByName("key"), + }, + { + {"hash_count", &count_options}, + {"hash_sum", nullptr}, + {"hash_sum", 
nullptr}, + })); + + AssertDatumsEqual( + ArrayFromJSON(struct_({ + field("hash_count", int64()), + // NB: summing a float32 array results in float64 sums + field("hash_sum", float64()), + field("hash_sum", int64()), + field("key_0", int64()), + }), + R"([ + [2, 4.25, 3, 1], + [3, -0.125, 6, 2], + [0, null, 6, 3], + [2, 4.75, null, null] + ])"), + aggregated_and_grouped, + /*verbose=*/true); +} + +TEST(GroupBy, SumOnlyStringAndDictKeys) { + for (auto key_type : {utf8(), dictionary(int32(), utf8())}) { + SCOPED_TRACE("key type: " + key_type->ToString()); + + auto batch = RecordBatchFromJSON( + schema({field("argument", float64()), field("key", key_type)}), R"([ + [1.0, "alfa"], + [null, "alfa"], + [0.0, "beta"], + [null, "gama"], + [4.0, null ], + [3.25, "alfa"], + [0.125, "beta"], + [-0.25, "beta"], + [0.75, null ], + [null, "gama"] + ])"); + + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + internal::GroupBy({batch->GetColumnByName("argument")}, + {batch->GetColumnByName("key")}, + { + {"hash_sum", nullptr}, + })); + + AssertDatumsEqual(ArrayFromJSON(struct_({ + field("hash_sum", float64()), + field("key_0", key_type), + }), + R"([ + [4.25, "alfa"], + [-0.125, "beta"], + [null, "gama"], + [4.75, null ] + ])"), + aggregated_and_grouped, + /*verbose=*/true); + } +} + +TEST(GroupBy, ConcreteCaseWithValidateGroupBy) { + auto batch = RecordBatchFromJSON( + schema({field("argument", float64()), field("key", utf8())}), R"([ + [1.0, "alfa"], + [null, "alfa"], + [0.0, "beta"], + [null, "gama"], + [4.0, null ], + [3.25, "alfa"], + [0.125, "beta"], + [-0.25, "beta"], + [0.75, null ], + [null, "gama"] + ])"); + + CountOptions count_non_null{CountOptions::COUNT_NON_NULL}, + count_null{CountOptions::COUNT_NULL}; + + MinMaxOptions emit_null{MinMaxOptions::EMIT_NULL}; + + using internal::Aggregate; + for (auto agg : { + Aggregate{"hash_sum", nullptr}, + Aggregate{"hash_count", &count_non_null}, + Aggregate{"hash_count", &count_null}, + Aggregate{"hash_min_max", nullptr}, + Aggregate{"hash_min_max", &emit_null}, + }) { + SCOPED_TRACE(agg.function); + ValidateGroupBy({agg}, {batch->GetColumnByName("argument")}, + {batch->GetColumnByName("key")}); + } +} + +TEST(GroupBy, RandomArraySum) { + for (int64_t length : {1 << 10, 1 << 12, 1 << 15}) { + for (auto null_probability : {0.0, 0.01, 0.5, 1.0}) { + auto batch = random::GenerateBatch( + { + field("argument", float32(), + key_value_metadata( + {{"null_probability", std::to_string(null_probability)}})), + field("key", int64(), key_value_metadata({{"min", "0"}, {"max", "100"}})), + }, + length, 0xDEADBEEF); + + ValidateGroupBy( + { + {"hash_sum", nullptr}, + }, + {batch->GetColumnByName("argument")}, {batch->GetColumnByName("key")}); + } + } +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc index 77890d27da5..160c4ce8857 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc @@ -302,8 +302,8 @@ struct CastFunctor> { // Decimal to integer struct DecimalToIntegerMixin { - template - OutValue ToInteger(KernelContext* ctx, const Decimal128& val) const { + template + OutValue ToInteger(KernelContext* ctx, const Arg0Value& val) const { constexpr auto min_value = std::numeric_limits::min(); constexpr auto max_value = std::numeric_limits::max(); @@ -326,7 +326,7 @@ struct UnsafeUpscaleDecimalToInteger : public DecimalToIntegerMixin { using 
DecimalToIntegerMixin::DecimalToIntegerMixin; template - OutValue Call(KernelContext* ctx, Decimal128 val) const { + OutValue Call(KernelContext* ctx, Arg0Value val) const { return ToInteger(ctx, val.IncreaseScaleBy(-in_scale_)); } }; @@ -335,7 +335,7 @@ struct UnsafeDownscaleDecimalToInteger : public DecimalToIntegerMixin { using DecimalToIntegerMixin::DecimalToIntegerMixin; template - OutValue Call(KernelContext* ctx, Decimal128 val) const { + OutValue Call(KernelContext* ctx, Arg0Value val) const { return ToInteger(ctx, val.ReduceScaleBy(in_scale_, false)); } }; @@ -344,7 +344,7 @@ struct SafeRescaleDecimalToInteger : public DecimalToIntegerMixin { using DecimalToIntegerMixin::DecimalToIntegerMixin; template - OutValue Call(KernelContext* ctx, Decimal128 val) const { + OutValue Call(KernelContext* ctx, Arg0Value val) const { auto result = val.Rescale(in_scale_, 0); if (ARROW_PREDICT_FALSE(!result.ok())) { ctx->SetStatus(result.status()); @@ -355,35 +355,33 @@ struct SafeRescaleDecimalToInteger : public DecimalToIntegerMixin { } }; -template -struct CastFunctor::value>> { +template +struct CastFunctor::value && is_decimal_type::value>> { using out_type = typename O::c_type; static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& options = checked_cast(ctx->state())->options; - const auto& in_type_inst = checked_cast(*batch[0].type()); + const auto& in_type_inst = checked_cast(*batch[0].type()); const auto in_scale = in_type_inst.scale(); if (options.allow_decimal_truncate) { if (in_scale < 0) { // Unsafe upscale - applicator::ScalarUnaryNotNullStateful + applicator::ScalarUnaryNotNullStateful kernel(UnsafeUpscaleDecimalToInteger{in_scale, options.allow_int_overflow}); return kernel.Exec(ctx, batch, out); } else { // Unsafe downscale - applicator::ScalarUnaryNotNullStateful + applicator::ScalarUnaryNotNullStateful kernel(UnsafeDownscaleDecimalToInteger{in_scale, options.allow_int_overflow}); return kernel.Exec(ctx, batch, out); } } else { // Safe rescale - applicator::ScalarUnaryNotNullStateful - kernel(SafeRescaleDecimalToInteger{in_scale, options.allow_int_overflow}); + applicator::ScalarUnaryNotNullStateful kernel( + SafeRescaleDecimalToInteger{in_scale, options.allow_int_overflow}); return kernel.Exec(ctx, batch, out); } } @@ -392,72 +390,104 @@ struct CastFunctor::value>> { // ---------------------------------------------------------------------- // Decimal to decimal +// Helper that converts the input and output decimals +// For instance, Decimal128 -> Decimal256 requires converting, then scaling +// Decimal256 -> Decimal128 requires scaling, then truncating +template +struct DecimalConversions {}; + +template +struct DecimalConversions { + // Convert then scale + static Decimal256 ConvertInput(InDecimal&& val) { return Decimal256(val); } + static Decimal256 ConvertOutput(Decimal256&& val) { return val; } +}; + +template <> +struct DecimalConversions { + // Scale then truncate + static Decimal256 ConvertInput(Decimal256&& val) { return val; } + static Decimal128 ConvertOutput(Decimal256&& val) { + return Decimal128(val.little_endian_array()[1], val.little_endian_array()[0]); + } +}; + +template <> +struct DecimalConversions { + static Decimal128 ConvertInput(Decimal128&& val) { return val; } + static Decimal128 ConvertOutput(Decimal128&& val) { return val; } +}; + struct UnsafeUpscaleDecimal { - template - Decimal128 Call(KernelContext* ctx, Decimal128 val) const { - return val.IncreaseScaleBy(by_); + template + OutValue Call(KernelContext* ctx, 
Arg0Value val) const { + using Conv = DecimalConversions; + return Conv::ConvertOutput(Conv::ConvertInput(std::move(val)).IncreaseScaleBy(by_)); } int32_t by_; }; struct UnsafeDownscaleDecimal { - template - Decimal128 Call(KernelContext* ctx, Decimal128 val) const { - return val.ReduceScaleBy(by_, false); + template + OutValue Call(KernelContext* ctx, Arg0Value val) const { + using Conv = DecimalConversions; + return Conv::ConvertOutput( + Conv::ConvertInput(std::move(val)).ReduceScaleBy(by_, false)); } int32_t by_; }; struct SafeRescaleDecimal { - template - Decimal128 Call(KernelContext* ctx, Decimal128 val) const { - auto maybe_rescaled = val.Rescale(in_scale_, out_scale_); + template + OutValue Call(KernelContext* ctx, Arg0Value val) const { + using Conv = DecimalConversions; + auto maybe_rescaled = + Conv::ConvertInput(std::move(val)).Rescale(in_scale_, out_scale_); if (ARROW_PREDICT_FALSE(!maybe_rescaled.ok())) { ctx->SetStatus(maybe_rescaled.status()); return {}; // Zero } if (ARROW_PREDICT_TRUE(maybe_rescaled->FitsInPrecision(out_precision_))) { - return maybe_rescaled.MoveValueUnsafe(); + return Conv::ConvertOutput(maybe_rescaled.MoveValueUnsafe()); } - ctx->SetStatus(Status::Invalid("Decimal value does not fit in precision")); + ctx->SetStatus( + Status::Invalid("Decimal value does not fit in precision ", out_precision_)); return {}; // Zero } int32_t out_scale_, out_precision_, in_scale_; }; -template <> -struct CastFunctor { +template +struct CastFunctor::value && is_decimal_type::value>> { static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& options = checked_cast(ctx->state())->options; - const auto& in_type = checked_cast(*batch[0].type()); - const auto& out_type = checked_cast(*out->type()); + const auto& in_type = checked_cast(*batch[0].type()); + const auto& out_type = checked_cast(*out->type()); const auto in_scale = in_type.scale(); const auto out_scale = out_type.scale(); if (options.allow_decimal_truncate) { if (in_scale < out_scale) { // Unsafe upscale - applicator::ScalarUnaryNotNullStateful - kernel(UnsafeUpscaleDecimal{out_scale - in_scale}); + applicator::ScalarUnaryNotNullStateful kernel( + UnsafeUpscaleDecimal{out_scale - in_scale}); return kernel.Exec(ctx, batch, out); } else { // Unsafe downscale - applicator::ScalarUnaryNotNullStateful - kernel(UnsafeDownscaleDecimal{in_scale - out_scale}); + applicator::ScalarUnaryNotNullStateful kernel( + UnsafeDownscaleDecimal{in_scale - out_scale}); return kernel.Exec(ctx, batch, out); } } // Safe rescale - applicator::ScalarUnaryNotNullStateful - kernel(SafeRescaleDecimal{out_scale, out_type.precision(), in_scale}); + applicator::ScalarUnaryNotNullStateful kernel( + SafeRescaleDecimal{out_scale, out_type.precision(), in_scale}); return kernel.Exec(ctx, batch, out); } }; @@ -467,8 +497,8 @@ struct CastFunctor { struct RealToDecimal { template - Decimal128 Call(KernelContext* ctx, RealType val) const { - auto maybe_decimal = Decimal128::FromReal(val, out_precision_, out_scale_); + OutValue Call(KernelContext* ctx, RealType val) const { + auto maybe_decimal = OutValue::FromReal(val, out_precision_, out_scale_); if (ARROW_PREDICT_TRUE(maybe_decimal.ok())) { return maybe_decimal.MoveValueUnsafe(); @@ -484,15 +514,16 @@ struct RealToDecimal { bool allow_truncate_; }; -template -struct CastFunctor::value>> { +template +struct CastFunctor::value && is_floating_type::value>> { static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& options = 
checked_cast(ctx->state())->options; - const auto& out_type = checked_cast(*out->type()); + const auto& out_type = checked_cast(*out->type()); const auto out_scale = out_type.scale(); const auto out_precision = out_type.precision(); - applicator::ScalarUnaryNotNullStateful kernel( + applicator::ScalarUnaryNotNullStateful kernel( RealToDecimal{out_scale, out_precision, options.allow_decimal_truncate}); return kernel.Exec(ctx, batch, out); } @@ -503,20 +534,21 @@ struct CastFunctor::value>> { struct DecimalToReal { template - RealType Call(KernelContext* ctx, const Decimal128& val) const { - return val.ToReal(in_scale_); + RealType Call(KernelContext* ctx, const Arg0Value& val) const { + return val.template ToReal(in_scale_); } int32_t in_scale_; }; -template -struct CastFunctor::value>> { +template +struct CastFunctor::value && is_decimal_type::value>> { static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const auto& in_type = checked_cast(*batch[0].type()); + const auto& in_type = checked_cast(*batch[0].type()); const auto in_scale = in_type.scale(); - applicator::ScalarUnaryNotNullStateful kernel( + applicator::ScalarUnaryNotNullStateful kernel( DecimalToReal{in_scale}); return kernel.Exec(ctx, batch, out); } @@ -562,6 +594,8 @@ std::shared_ptr GetCastToInteger(std::string name) { // From decimal to integer DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType(Type::DECIMAL)}, out_ty, CastFunctor::Exec)); + DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty, + CastFunctor::Exec)); return func; } @@ -586,6 +620,8 @@ std::shared_ptr GetCastToFloating(std::string name) { // From decimal to floating point DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType(Type::DECIMAL)}, out_ty, CastFunctor::Exec)); + DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty, + CastFunctor::Exec)); return func; } @@ -606,6 +642,9 @@ std::shared_ptr GetCastToDecimal128() { // We resolve the output type of this kernel from the CastOptions DCHECK_OK( func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec)); + exec = CastFunctor::Exec; + DCHECK_OK( + func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, sig_out_ty, exec)); return func; } @@ -613,10 +652,21 @@ std::shared_ptr GetCastToDecimal256() { OutputType sig_out_ty(ResolveOutputFromOptions); auto func = std::make_shared("cast_decimal256", Type::DECIMAL256); - // Needed for Parquet conversion. Full implementation is ARROW-10606 - // tracks full implementation. 
AddCommonCasts(Type::DECIMAL256, sig_out_ty, func.get()); + // Cast from floating point + DCHECK_OK(func->AddKernel(Type::FLOAT, {float32()}, sig_out_ty, + CastFunctor::Exec)); + DCHECK_OK(func->AddKernel(Type::DOUBLE, {float64()}, sig_out_ty, + CastFunctor::Exec)); + + // Cast from other decimal + auto exec = CastFunctor::Exec; + DCHECK_OK( + func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec)); + exec = CastFunctor::Exec; + DCHECK_OK( + func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, sig_out_ty, exec)); return func; } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 99a56346c1b..10e5ed26e5d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -404,7 +404,7 @@ TEST(Cast, IntToFloating) { CastOptions::Safe(float64())); } -TEST(Cast, DecimalToInt) { +TEST(Cast, Decimal128ToInt) { auto options = CastOptions::Safe(int64()); for (bool allow_int_overflow : {false, true}) { @@ -494,7 +494,98 @@ TEST(Cast, DecimalToInt) { CheckCast(negative_scale, ArrayFromJSON(int64(), "[1234567890000, -120000]"), options); } -TEST(Cast, DecimalToDecimal) { +TEST(Cast, Decimal256ToInt) { + auto options = CastOptions::Safe(int64()); + + for (bool allow_int_overflow : {false, true}) { + for (bool allow_decimal_truncate : {false, true}) { + options.allow_int_overflow = allow_int_overflow; + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_overflow_no_truncation = ArrayFromJSON(decimal256(40, 10), R"([ + "02.0000000000", + "-11.0000000000", + "22.0000000000", + "-121.0000000000", + null])"); + CheckCast(no_overflow_no_truncation, + ArrayFromJSON(int64(), "[2, -11, 22, -121, null]"), options); + } + } + + for (bool allow_int_overflow : {false, true}) { + options.allow_int_overflow = allow_int_overflow; + auto truncation_but_no_overflow = ArrayFromJSON(decimal256(40, 10), R"([ + "02.1000000000", + "-11.0000004500", + "22.0000004500", + "-121.1210000000", + null])"); + + options.allow_decimal_truncate = true; + CheckCast(truncation_but_no_overflow, + ArrayFromJSON(int64(), "[2, -11, 22, -121, null]"), options); + + options.allow_decimal_truncate = false; + CheckCastFails(truncation_but_no_overflow, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto overflow_no_truncation = ArrayFromJSON(decimal256(40, 10), R"([ + "1234567890123456789000000.0000000000", + "9999999999999999999999999.0000000000", + null])"); + + options.allow_int_overflow = true; + CheckCast(overflow_no_truncation, + ArrayFromJSON( + int64(), + // 1234567890123456789000000 % 2**64, 9999999999999999999999999 % 2**64 + "[1096246371337547584, 1590897978359414783, null]"), + options); + + options.allow_int_overflow = false; + CheckCastFails(overflow_no_truncation, options); + } + + for (bool allow_int_overflow : {false, true}) { + for (bool allow_decimal_truncate : {false, true}) { + options.allow_int_overflow = allow_int_overflow; + options.allow_decimal_truncate = allow_decimal_truncate; + + auto overflow_and_truncation = ArrayFromJSON(decimal256(40, 10), R"([ + "1234567890123456789000000.0045345000", + "9999999999999999999999999.0000344300", + null])"); + + if (options.allow_int_overflow && options.allow_decimal_truncate) { + CheckCast( + overflow_and_truncation, + ArrayFromJSON( + int64(), + // 1234567890123456789000000 % 2**64, 9999999999999999999999999 % 2**64 + 
"[1096246371337547584, 1590897978359414783, null]"), + options); + } else { + CheckCastFails(overflow_and_truncation, options); + } + } + } + + Decimal256Builder builder(decimal256(40, -4)); + for (auto d : {Decimal256("1234567890000."), Decimal256("-120000.")}) { + ASSERT_OK_AND_ASSIGN(d, d.Rescale(0, -4)); + ASSERT_OK(builder.Append(d)); + } + ASSERT_OK_AND_ASSIGN(auto negative_scale, builder.Finish()); + options.allow_int_overflow = true; + options.allow_decimal_truncate = true; + CheckCast(negative_scale, ArrayFromJSON(int64(), "[1234567890000, -120000]"), options); +} + +TEST(Cast, Decimal128ToDecimal128) { CastOptions options; for (bool allow_decimal_truncate : {false, true}) { @@ -573,51 +664,306 @@ TEST(Cast, DecimalToDecimal) { } } -TEST(Cast, FloatingToDecimal) { - for (auto float_type : {float32(), float64()}) { - CheckCast( - ArrayFromJSON(float_type, "[0.0, null, 123.45, 123.456, 999.994]"), - ArrayFromJSON(decimal(5, 2), R"(["0.00", null, "123.45", "123.46", "999.99"])")); +TEST(Cast, Decimal256ToDecimal256) { + CastOptions options; - // Overflow - CastOptions options; - options.to_type = decimal(5, 2); - CheckCastFails(ArrayFromJSON(float_type, "[999.996]"), options); + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_truncation = ArrayFromJSON(decimal256(38, 10), R"([ + "02.0000000000", + "30.0000000000", + "22.0000000000", + "-121.0000000000", + null])"); + auto expected = ArrayFromJSON(decimal256(28, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + CheckCast(expected, no_truncation, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + // Same scale, different precision + auto d_5_2 = ArrayFromJSON(decimal256(5, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal256(4, 2), R"([ + "12.34", + "0.56"])"); + + CheckCast(d_5_2, d_4_2, options); + CheckCast(d_4_2, d_5_2, options); + } + + auto d_38_10 = ArrayFromJSON(decimal256(38, 10), R"([ + "-02.1234567890", + "30.1234567890", + null])"); + + auto d_28_0 = ArrayFromJSON(decimal256(28, 0), R"([ + "-02.", + "30.", + null])"); + + auto d_38_10_roundtripped = ArrayFromJSON(decimal256(38, 10), R"([ + "-02.0000000000", + "30.0000000000", + null])"); + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d_38_10, d_28_0, options); + CheckCast(d_28_0, d_38_10_roundtripped, options); + + options.allow_decimal_truncate = false; + options.to_type = d_28_0->type(); + CheckCastFails(d_38_10, options); + CheckCast(d_28_0, d_38_10_roundtripped, options); + + // Precision loss without rescale leads to truncation + auto d_4_2 = ArrayFromJSON(decimal256(4, 2), R"(["12.34"])"); + for (auto expected : { + ArrayFromJSON(decimal256(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal256(4, 3), R"(["12.340"])"), + ArrayFromJSON(decimal256(2, 1), R"(["12.3"])"), + }) { options.allow_decimal_truncate = true; - CheckCast( - ArrayFromJSON(float_type, "[0.0, null, 999.996, 123.45, 999.994]"), - ArrayFromJSON(decimal(5, 2), R"(["0.00", null, "0.00", "123.45", "999.99"])"), - options); + CheckCast(d_4_2, expected, options); + + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + CheckCastFails(d_4_2, options); } +} - // 2**64 + 2**41 (exactly representable as a float) - CheckCast(ArrayFromJSON(float32(), "[1.8446746e+19, -1.8446746e+19]"), - ArrayFromJSON(decimal(20, 
0), - R"(["18446746272732807168", "-18446746272732807168"])")); +TEST(Cast, Decimal128ToDecimal256) { + CastOptions options; - CheckCast(ArrayFromJSON(float32(), "[1.8446746e+15, -1.8446746e+15]"), - ArrayFromJSON(decimal(20, 4), - R"(["1844674627273280.7168", "-1844674627273280.7168"])")); + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; - CheckCast(ArrayFromJSON(float64(), "[1.8446744073709556e+19, -1.8446744073709556e+19]"), - ArrayFromJSON(decimal(20, 0), - R"(["18446744073709555712", "-18446744073709555712"])")); + auto no_truncation = ArrayFromJSON(decimal(38, 10), R"([ + "02.0000000000", + "30.0000000000", + "22.0000000000", + "-121.0000000000", + null])"); + auto expected = ArrayFromJSON(decimal256(48, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + // Same scale, different precision + auto d_5_2 = ArrayFromJSON(decimal(5, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal256(4, 2), R"([ + "12.34", + "0.56"])"); + auto d_40_2 = ArrayFromJSON(decimal256(40, 2), R"([ + "12.34", + "0.56"])"); + + CheckCast(d_5_2, d_4_2, options); + CheckCast(d_5_2, d_40_2, options); + } + + auto d128_38_10 = ArrayFromJSON(decimal(38, 10), R"([ + "-02.1234567890", + "30.1234567890", + null])"); + + auto d128_28_0 = ArrayFromJSON(decimal(28, 0), R"([ + "-02.", + "30.", + null])"); + + auto d256_28_0 = ArrayFromJSON(decimal256(28, 0), R"([ + "-02.", + "30.", + null])"); + + auto d256_38_10_roundtripped = ArrayFromJSON(decimal256(38, 10), R"([ + "-02.0000000000", + "30.0000000000", + null])"); - CheckCast(ArrayFromJSON(float64(), "[1.8446744073709556e+15, -1.8446744073709556e+15]"), - ArrayFromJSON(decimal(20, 4), - R"(["1844674407370955.5712", "-1844674407370955.5712"])")); + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d128_38_10, d256_28_0, options); + CheckCast(d128_28_0, d256_38_10_roundtripped, options); + + options.allow_decimal_truncate = false; + options.to_type = d256_28_0->type(); + CheckCastFails(d128_38_10, options); + CheckCast(d128_28_0, d256_38_10_roundtripped, options); - // Edge cases are tested for Decimal128::FromReal() + // Precision loss without rescale leads to truncation + auto d128_4_2 = ArrayFromJSON(decimal(4, 2), R"(["12.34"])"); + for (auto expected : { + ArrayFromJSON(decimal256(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal256(4, 3), R"(["12.340"])"), + ArrayFromJSON(decimal256(2, 1), R"(["12.3"])"), + }) { + options.allow_decimal_truncate = true; + CheckCast(d128_4_2, expected, options); + + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + CheckCastFails(d128_4_2, options); + } +} + +TEST(Cast, Decimal256ToDecimal128) { + CastOptions options; + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_truncation = ArrayFromJSON(decimal256(42, 10), R"([ + "02.0000000000", + "30.0000000000", + "22.0000000000", + "-121.0000000000", + null])"); + auto expected = ArrayFromJSON(decimal(28, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + // Same scale, different precision + auto d_5_2 = 
ArrayFromJSON(decimal256(42, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal(4, 2), R"([ + "12.34", + "0.56"])"); + + CheckCast(d_5_2, d_4_2, options); + } + + auto d256_52_10 = ArrayFromJSON(decimal256(52, 10), R"([ + "-02.1234567890", + "30.1234567890", + null])"); + + auto d256_42_0 = ArrayFromJSON(decimal256(42, 0), R"([ + "-02.", + "30.", + null])"); + + auto d128_28_0 = ArrayFromJSON(decimal(28, 0), R"([ + "-02.", + "30.", + null])"); + + auto d128_38_10_roundtripped = ArrayFromJSON(decimal(38, 10), R"([ + "-02.0000000000", + "30.0000000000", + null])"); + + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d256_52_10, d128_28_0, options); + CheckCast(d256_42_0, d128_38_10_roundtripped, options); + + options.allow_decimal_truncate = false; + options.to_type = d128_28_0->type(); + CheckCastFails(d256_52_10, options); + CheckCast(d256_42_0, d128_38_10_roundtripped, options); + + // Precision loss without rescale leads to truncation + auto d256_4_2 = ArrayFromJSON(decimal256(4, 2), R"(["12.34"])"); + for (auto expected : { + ArrayFromJSON(decimal(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal(4, 3), R"(["12.340"])"), + ArrayFromJSON(decimal(2, 1), R"(["12.3"])"), + }) { + options.allow_decimal_truncate = true; + CheckCast(d256_4_2, expected, options); + + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + CheckCastFails(d256_4_2, options); + } +} + +TEST(Cast, FloatingToDecimal) { + for (auto float_type : {float32(), float64()}) { + for (auto decimal_type : {decimal(5, 2), decimal256(5, 2)}) { + CheckCast( + ArrayFromJSON(float_type, "[0.0, null, 123.45, 123.456, 999.994]"), + ArrayFromJSON(decimal_type, R"(["0.00", null, "123.45", "123.46", "999.99"])")); + + // Overflow + CastOptions options; + options.to_type = decimal_type; + CheckCastFails(ArrayFromJSON(float_type, "[999.996]"), options); + + options.allow_decimal_truncate = true; + CheckCast( + ArrayFromJSON(float_type, "[0.0, null, 999.996, 123.45, 999.994]"), + ArrayFromJSON(decimal_type, R"(["0.00", null, "0.00", "123.45", "999.99"])"), + options); + } + } + + for (auto decimal_type : {decimal128, decimal256}) { + // 2**64 + 2**41 (exactly representable as a float) + CheckCast(ArrayFromJSON(float32(), "[1.8446746e+19, -1.8446746e+19]"), + ArrayFromJSON(decimal_type(20, 0), + R"(["18446746272732807168", "-18446746272732807168"])")); + + CheckCast( + ArrayFromJSON(float64(), "[1.8446744073709556e+19, -1.8446744073709556e+19]"), + ArrayFromJSON(decimal_type(20, 0), + R"(["18446744073709555712", "-18446744073709555712"])")); + + CheckCast(ArrayFromJSON(float32(), "[1.8446746e+15, -1.8446746e+15]"), + ArrayFromJSON(decimal_type(20, 4), + R"(["1844674627273280.7168", "-1844674627273280.7168"])")); + + CheckCast( + ArrayFromJSON(float64(), "[1.8446744073709556e+15, -1.8446744073709556e+15]"), + ArrayFromJSON(decimal_type(20, 4), + R"(["1844674407370955.5712", "-1844674407370955.5712"])")); + + // Edge cases are tested for Decimal128::FromReal() and Decimal256::FromReal + } } TEST(Cast, DecimalToFloating) { for (auto float_type : {float32(), float64()}) { - CheckCast(ArrayFromJSON(decimal(5, 2), R"(["0.00", null, "123.45", "999.99"])"), - ArrayFromJSON(float_type, "[0.0, null, 123.45, 999.99]")); + for (auto decimal_type : {decimal(5, 2), decimal256(5, 2)}) { + CheckCast(ArrayFromJSON(decimal_type, R"(["0.00", null, "123.45", "999.99"])"), + ArrayFromJSON(float_type, "[0.0, null, 123.45, 999.99]")); + } } - // Edge cases are tested for 
Decimal128::ToReal() + // Edge cases are tested for Decimal128::ToReal() and Decimal256::ToReal() } TEST(Cast, TimestampToTimestamp) { diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 2eeac71c727..39869879561 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -23,6 +23,10 @@ #include #endif +#ifdef ARROW_WITH_RE2 +#include +#endif + #include "arrow/array/builder_binary.h" #include "arrow/array/builder_nested.h" #include "arrow/buffer_builder.h" @@ -64,6 +68,22 @@ struct BinaryLength { } }; +struct Utf8Length { + template + static OutValue Call(KernelContext*, Arg0Value val) { + auto str = reinterpret_cast(val.data()); + auto strlen = val.size(); + + OutValue length = 0; + while (strlen > 0) { + length += ((*str & 0xc0) != 0x80); + ++str; + --strlen; + } + return length; + } +}; + #ifdef ARROW_WITH_UTF8PROC // Direct lookup tables for unicode properties @@ -1214,6 +1234,197 @@ void AddSplit(FunctionRegistry* registry) { #endif } +// ---------------------------------------------------------------------- +// Replace substring (plain, regex) + +template +struct ReplaceSubString { + using ScalarType = typename TypeTraits::ScalarType; + using offset_type = typename Type::offset_type; + using ValueDataBuilder = TypedBufferBuilder; + using OffsetBuilder = TypedBufferBuilder; + using State = OptionsWrapper; + + static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + // TODO Cache replacer accross invocations (for regex compilation) + Replacer replacer{ctx, State::Get(ctx)}; + if (!ctx->HasError()) { + Replace(ctx, batch, &replacer, out); + } + } + + static void Replace(KernelContext* ctx, const ExecBatch& batch, Replacer* replacer, + Datum* out) { + ValueDataBuilder value_data_builder(ctx->memory_pool()); + OffsetBuilder offset_builder(ctx->memory_pool()); + + if (batch[0].kind() == Datum::ARRAY) { + // We already know how many strings we have, so we can use Reserve/UnsafeAppend + KERNEL_RETURN_IF_ERROR(ctx, offset_builder.Reserve(batch[0].array()->length)); + offset_builder.UnsafeAppend(0); // offsets start at 0 + + const ArrayData& input = *batch[0].array(); + KERNEL_RETURN_IF_ERROR( + ctx, VisitArrayDataInline( + input, + [&](util::string_view s) { + RETURN_NOT_OK(replacer->ReplaceString(s, &value_data_builder)); + offset_builder.UnsafeAppend( + static_cast(value_data_builder.length())); + return Status::OK(); + }, + [&]() { + // offset for null value + offset_builder.UnsafeAppend( + static_cast(value_data_builder.length())); + return Status::OK(); + })); + ArrayData* output = out->mutable_array(); + KERNEL_RETURN_IF_ERROR(ctx, value_data_builder.Finish(&output->buffers[2])); + KERNEL_RETURN_IF_ERROR(ctx, offset_builder.Finish(&output->buffers[1])); + } else { + const auto& input = checked_cast(*batch[0].scalar()); + auto result = std::make_shared(); + if (input.is_valid) { + util::string_view s = static_cast(*input.value); + KERNEL_RETURN_IF_ERROR(ctx, replacer->ReplaceString(s, &value_data_builder)); + KERNEL_RETURN_IF_ERROR(ctx, value_data_builder.Finish(&result->value)); + result->is_valid = true; + } + out->value = result; + } + } +}; + +struct PlainSubStringReplacer { + const ReplaceSubstringOptions& options_; + + PlainSubStringReplacer(KernelContext* ctx, const ReplaceSubstringOptions& options) + : options_(options) {} + + Status ReplaceString(util::string_view s, TypedBufferBuilder* builder) { + const char* i = s.begin(); + const char* 
end = s.end(); + int64_t max_replacements = options_.max_replacements; + while ((i < end) && (max_replacements != 0)) { + const char* pos = + std::search(i, end, options_.pattern.begin(), options_.pattern.end()); + if (pos == end) { + RETURN_NOT_OK(builder->Append(reinterpret_cast(i), + static_cast(end - i))); + i = end; + } else { + // the string before the pattern + RETURN_NOT_OK(builder->Append(reinterpret_cast(i), + static_cast(pos - i))); + // the replacement + RETURN_NOT_OK( + builder->Append(reinterpret_cast(options_.replacement.data()), + options_.replacement.length())); + // skip pattern + i = pos + options_.pattern.length(); + max_replacements--; + } + } + // if we exited early due to max_replacements, add the trailing part + RETURN_NOT_OK(builder->Append(reinterpret_cast(i), + static_cast(end - i))); + return Status::OK(); + } +}; + +#ifdef ARROW_WITH_RE2 +struct RegexSubStringReplacer { + const ReplaceSubstringOptions& options_; + const RE2 regex_find_; + const RE2 regex_replacement_; + + // Using RE2::FindAndConsume we can only find the pattern if it is a group, therefore + // we have 2 regexes, one with () around it, one without. + RegexSubStringReplacer(KernelContext* ctx, const ReplaceSubstringOptions& options) + : options_(options), + regex_find_("(" + options_.pattern + ")"), + regex_replacement_(options_.pattern) { + if (!(regex_find_.ok() && regex_replacement_.ok())) { + ctx->SetStatus(Status::Invalid("Regular expression error")); + return; + } + } + + Status ReplaceString(util::string_view s, TypedBufferBuilder* builder) { + re2::StringPiece replacement(options_.replacement); + if (options_.max_replacements == -1) { + std::string s_copy(s.to_string()); + re2::RE2::GlobalReplace(&s_copy, regex_replacement_, replacement); + RETURN_NOT_OK(builder->Append(reinterpret_cast(s_copy.data()), + s_copy.length())); + return Status::OK(); + } + + // Since RE2 does not have the concept of max_replacements, we have to do some work + // ourselves. + // We might do this faster similar to RE2::GlobalReplace using Match and Rewrite + const char* i = s.begin(); + const char* end = s.end(); + re2::StringPiece piece(s.data(), s.length()); + + int64_t max_replacements = options_.max_replacements; + while ((i < end) && (max_replacements != 0)) { + std::string found; + if (!re2::RE2::FindAndConsume(&piece, regex_find_, &found)) { + RETURN_NOT_OK(builder->Append(reinterpret_cast(i), + static_cast(end - i))); + i = end; + } else { + // wind back to the beginning of the match + const char* pos = piece.begin() - found.length(); + // the string before the pattern + RETURN_NOT_OK(builder->Append(reinterpret_cast(i), + static_cast(pos - i))); + // replace the pattern in what we found + if (!re2::RE2::Replace(&found, regex_replacement_, replacement)) { + return Status::Invalid("Regex found, but replacement failed"); + } + RETURN_NOT_OK(builder->Append(reinterpret_cast(found.data()), + static_cast(found.length()))); + // skip pattern + i = piece.begin(); + max_replacements--; + } + } + // If we exited early due to max_replacements, add the trailing part + RETURN_NOT_OK(builder->Append(reinterpret_cast(i), + static_cast(end - i))); + return Status::OK(); + } +}; +#endif + +template +using ReplaceSubStringPlain = ReplaceSubString; + +const FunctionDoc replace_substring_doc( + "Replace non-overlapping substrings that match pattern by replacement", + ("For each string in `strings`, replace non-overlapping substrings that match\n" + "`pattern` by `replacement`. 
If `max_replacements != -1`, it determines the\n" + "maximum amount of replacements made, counting from the left. Null values emit\n" + "null."), + {"strings"}, "ReplaceSubstringOptions"); + +#ifdef ARROW_WITH_RE2 +template +using ReplaceSubStringRegex = ReplaceSubString; + +const FunctionDoc replace_substring_regex_doc( + "Replace non-overlapping substrings that match regex `pattern` by `replacement`", + ("For each string in `strings`, replace non-overlapping substrings that match the\n" + "regular expression `pattern` by `replacement` using the Google RE2 library.\n" + "If `max_replacements != -1`, it determines the maximum amount of replacements\n" + "made, counting from the left. Note that if the pattern contains groups,\n" + "backreferencing macan be used. Null values emit null."), + {"strings"}, "ReplaceSubstringOptions"); +#endif + // ---------------------------------------------------------------------- // strptime string parsing @@ -1569,9 +1780,14 @@ const FunctionDoc strptime_doc( const FunctionDoc binary_length_doc( "Compute string lengths", - ("For each string in `strings`, emit its length. Null values emit null."), + ("For each string in `strings`, emit the number of bytes. Null values emit null."), {"strings"}); +const FunctionDoc utf8_length_doc("Compute UTF8 string lengths", + ("For each string in `strings`, emit the number of " + "UTF8 characters. Null values emit null."), + {"strings"}); + void AddStrptime(FunctionRegistry* registry) { auto func = std::make_shared("strptime", Arity::Unary(), &strptime_doc); DCHECK_OK(func->AddKernel({utf8()}, OutputType(StrptimeResolve), @@ -1597,6 +1813,21 @@ void AddBinaryLength(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); } +void AddUtf8Length(FunctionRegistry* registry) { + auto func = + std::make_shared("utf8_length", Arity::Unary(), &utf8_length_doc); + + ArrayKernelExec exec_offset_32 = + applicator::ScalarUnaryNotNull::Exec; + DCHECK_OK(func->AddKernel({utf8()}, int32(), std::move(exec_offset_32))); + + ArrayKernelExec exec_offset_64 = + applicator::ScalarUnaryNotNull::Exec; + DCHECK_OK(func->AddKernel({large_utf8()}, int64(), std::move(exec_offset_64))); + + DCHECK_OK(registry->AddFunction(std::move(func))); +} + template