diff --git a/.env b/.env index aa9e9c76670..cd6b57e004a 100644 --- a/.env +++ b/.env @@ -33,9 +33,13 @@ COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 BUILDKIT_INLINE_CACHE=1 +# different architecture notations +ARCH=amd64 +ARCH_ALIAS=x86_64 +ARCH_SHORT_ALIAS=x64 + ULIMIT_CORE=-1 REPO=apache/arrow-dev -ARCH=amd64 CUDA=9.1 DEBIAN=10 UBUNTU=18.04 @@ -44,7 +48,7 @@ PYTHON=3.6 LLVM=11 CLANG_TOOLS=8 RUST=nightly-2020-11-24 -GO=1.12 +GO=1.15 NODE=14 MAVEN=3.5.4 JDK=8 @@ -65,5 +69,6 @@ R_TAG=latest # -1 does not attempt to install a devtoolset version, any positive integer will install devtoolset-n DEVTOOLSET_VERSION=-1 -# Used for the manylinux and windows wheels -VCPKG=c7e96f2a5b73b3278b004aa88abec2f8ebfb43b5 +# Used for the manylinux and windows wheels, please update the crossbow configuration on update: +# https://github.com/ursacomputing/crossbow/blob/master/.github/workflows/cache_vcpkg.yml +VCPKG=fced4bef1606260f110d74de1ae1975c2b9ac549 diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 60a96081a19..574795f5e9b 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -45,7 +45,7 @@ jobs: strategy: fail-fast: false matrix: - go: [1.12] + go: [1.15] env: GO: ${{ matrix.go }} steps: @@ -77,7 +77,7 @@ jobs: strategy: fail-fast: false matrix: - go: [1.12] + go: [1.15] steps: - name: Install go uses: actions/setup-go@v1 @@ -104,7 +104,7 @@ jobs: strategy: fail-fast: false matrix: - go: [1.12] + go: [1.15] steps: - name: Install go uses: actions/setup-go@v1 diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 8869de77b34..e4aefbb5500 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -20,24 +20,24 @@ name: R on: push: paths: - - '.github/workflows/r.yml' - - 'ci/scripts/r_*.sh' - - 'ci/scripts/cpp_*.sh' - - 'ci/scripts/PKGBUILD' - - 'ci/etc/rprofile' - - 'ci/docker/**' - - 'cpp/**' - - 'r/**' + - ".github/workflows/r.yml" + - "ci/scripts/r_*.sh" + - "ci/scripts/cpp_*.sh" + - "ci/scripts/PKGBUILD" + - 
"ci/etc/rprofile" + - "ci/docker/**" + - "cpp/**" + - "r/**" pull_request: paths: - - '.github/workflows/r.yml' - - 'ci/scripts/r_*.sh' - - 'ci/scripts/cpp_*.sh' - - 'ci/scripts/PKGBUILD' - - 'ci/etc/rprofile' - - 'ci/docker/**' - - 'cpp/**' - - 'r/**' + - ".github/workflows/r.yml" + - "ci/scripts/r_*.sh" + - "ci/scripts/cpp_*.sh" + - "ci/scripts/PKGBUILD" + - "ci/etc/rprofile" + - "ci/docker/**" + - "cpp/**" + - "r/**" env: DOCKER_VOLUME_PREFIX: ".docker/" @@ -99,8 +99,8 @@ jobs: fail-fast: false matrix: config: - - {org: 'rstudio', image: 'r-base', tag: '4.0-centos7'} - - {org: 'rhub', image: 'debian-gcc-devel', tag: 'latest'} + - { org: "rstudio", image: "r-base", tag: "4.0-centos7" } + - { org: "rhub", image: "debian-gcc-devel", tag: "latest" } env: R_ORG: ${{ matrix.config.org }} R_IMAGE: ${{ matrix.config.image }} @@ -149,8 +149,9 @@ jobs: rtools: [35, 40] env: TEST_R_WITH_ARROW: "TRUE" - ARROW_R_CXXFLAGS: '-Werror' + ARROW_R_CXXFLAGS: "-Werror" _R_CHECK_TESTS_NLINES_: 0 + ARROW_R_DEV: "TRUE" steps: - run: git config --global core.autocrlf false - name: Checkout Arrow @@ -187,14 +188,17 @@ jobs: - uses: r-lib/actions/setup-r@master with: rtools-version: 40 - r-version: '4.0' + r-version: "4.0" Ncpus: 2 - uses: r-lib/actions/setup-r@master if: ${{ matrix.rtools == 35 }} with: rtools-version: 35 - r-version: '3.6' + r-version: "3.6" Ncpus: 2 + - name: Print Rscript location + shell: bash + run: which Rscript - name: Build Arrow C++ shell: bash env: @@ -227,6 +231,22 @@ jobs: shell: cmd run: cat check/arrow.Rcheck/00install.out if: always() + - name: Dump i386 tests + shell: cmd + run: cat check/arrow.Rcheck/tests_i386/testthat.Rout.fail + if: always() + - name: Dump other i386 stuff + shell: cmd + run: cat check/arrow.Rcheck/arrow-Ex_i386.Rout + if: always() + - name: Dump even more stuff + shell: cmd + run: cat check/arrow.Rcheck/00check.log + if: always() + - name: Dump x64 tests + shell: cmd + run: cat check/arrow.Rcheck/tests_x64/testthat.Rout* + if: 
always() # We can remove this when we drop support for Rtools 3.5. - name: Ensure using system tar in actions/cache run: | diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 9e36cf92987..1dd220ade94 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -66,6 +66,8 @@ jobs: run: | export CARGO_HOME="/github/home/.cargo" export CARGO_TARGET_DIR="/github/home/target" + # do not produce debug symbols to keep memory usage down + export RUSTFLAGS="-C debuginfo=0" cd rust cargo build @@ -108,11 +110,14 @@ jobs: run: | export CARGO_HOME="/github/home/.cargo" export CARGO_TARGET_DIR="/github/home/target" + # do not produce debug symbols to keep memory usage down + export RUSTFLAGS="-C debuginfo=0" cd rust # run tests on all workspace members with default feature list cargo test # test datafusion examples - cd datafusion + cd datafusion-examples + cargo test --no-default-features cargo run --example csv_sql cargo run --example parquet_sql cd .. @@ -161,6 +166,8 @@ jobs: run: | export CARGO_HOME="/github/home/.cargo" export CARGO_TARGET_DIR="/github/home/target" + # do not produce debug symbols to keep memory usage down + export RUSTFLAGS="-C debuginfo=0" cd rust/arrow cargo test --features "simd" @@ -387,5 +394,7 @@ jobs: run: | export CARGO_HOME="/github/home/.cargo" export CARGO_TARGET_DIR="/github/home/target" + # do not produce debug symbols to keep memory usage down + export RUSTFLAGS="-C debuginfo=0" cd rust/arrow cargo build --target wasm32-unknown-unknown diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e70eaceaf41..9d2d2d81d68 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,9 +40,10 @@ repos: - id: cmake-format name: CMake Format language: python - entry: bash -c "pip install cmake-format && python run-cmake-format.py --check" - entry: echo - files: ^(.*/CMakeLists.txt|.*.cmake)$ + entry: python run-cmake-format.py + types: [cmake] + additional_dependencies: + - 
cmake_format==0.5.2 - id: hadolint name: Docker Format language: docker_image diff --git a/.travis.yml b/.travis.yml index 57646246c4a..2cf70cca982 100644 --- a/.travis.yml +++ b/.travis.yml @@ -125,7 +125,8 @@ jobs: JDK: 11 allow_failures: - - arch: s390x + - name: "Go on s390x" + - name: "Java on s390x" before_install: - eval "$(python ci/detect-changes.py)" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1df5ea3e435..380886872fa 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -27,6 +27,13 @@ to first create an account on the hosts bugs and issues for multiple Apache projects. The JIRA project name for Arrow is "ARROW". +To be assigned to an issue, ask an Arrow JIRA admin to go to +[Arrow Roles](https://issues.apache.org/jira/plugins/servlet/project-config/ARROW/roles), +click "Add users to a role," and add you to the "Contributor" role. Most +committers are authorized to do this; if you're a committer and aren't +able to load that project admin page, have someone else add you to the +necessary role. + Before you create a new bug entry, we recommend you first [search](https://issues.apache.org/jira/projects/ARROW/issues/ARROW-5140?filter=allopenissues) among existing Arrow issues. diff --git a/LICENSE.txt b/LICENSE.txt index 4c2d96e6496..1480c1401c0 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -2119,6 +2119,22 @@ DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- +This project includes code from Folly. + + * cpp/src/arrow/vendored/ProducerConsumerQueue.h + +is based on Folly's + + * folly/Portability.h + * folly/lang/Align.h + * folly/ProducerConsumerQueue.h + +Copyright: Copyright (c) Facebook, Inc. and its affiliates. 
+Home page: https://github.com/facebook/folly +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + The file cpp/src/arrow/vendored/musl/strptime.c has the following license Copyright © 2005-2020 Rich Felker, et al. diff --git a/c_glib/README.md b/c_glib/README.md index 4ef9612c868..ac179354d8f 100644 --- a/c_glib/README.md +++ b/c_glib/README.md @@ -19,12 +19,14 @@ # Arrow GLib -Arrow GLib is a wrapper library for [Arrow C++](https://github.com/apache/arrow/tree/master/cpp). Arrow GLib provides C -API. +Arrow GLib is a wrapper library for [Arrow +C++](https://github.com/apache/arrow/tree/master/cpp). Arrow GLib +provides C API. -Arrow GLib supports -[GObject Introspection](https://wiki.gnome.org/action/show/Projects/GObjectIntrospection). -It means that you can create language bindings at runtime or compile time. +Arrow GLib supports [GObject +Introspection](https://wiki.gnome.org/action/show/Projects/GObjectIntrospection). +It means that you can create language bindings at runtime or compile +time. For example, you can use Apache Arrow from Ruby by Arrow GLib and [gobject-introspection gem](https://rubygems.org/gems/gobject-introspection) @@ -50,71 +52,47 @@ gobject-introspection gem based bindings. You can use packages or build by yourself to install Arrow GLib. It's recommended that you use packages. -Note that the packages are "unofficial". "Official" packages will be -released in the future. +We use Meson and Ninja as build tools. If you find problems when +installing please see [common build +problems](https://github.com/apache/arrow/blob/master/c_glib/README.md#common-build-problems). -We support two build systems, GNU Autotools and Meson. If you find problems when installing please see [common build problems](https://github.com/apache/arrow/blob/master/c_glib/README.md#common-build-problems). 
- -### Package +### Packages See [install document](https://arrow.apache.org/install/) for details. ### How to build by users Arrow GLib users should use released source archive to build Arrow -GLib (replace the version number in the following commands with the one you use): +GLib (replace the version number in the following commands with the +one you use): ```console -% wget https://archive.apache.org/dist/arrow/arrow-0.3.0/apache-arrow-0.3.0.tar.gz -% tar xf apache-arrow-0.3.0.tar.gz -% cd apache-arrow-0.3.0 +% wget https://downloads.apache.org/arrow/arrow-3.0.0/apache-arrow-3.0.0.tar.gz +% tar xf apache-arrow-3.0.0.tar.gz +% cd apache-arrow-3.0.0 ``` You need to build and install Arrow C++ before you build and install Arrow GLib. See Arrow C++ document about how to install Arrow C++. -If you use macOS with [Homebrew](https://brew.sh/), you must install required packages and set `PKG_CONFIG_PATH` before build Arrow GLib: - -If you use GNU Autotools, you can build and install Arrow GLib by the followings: - -macOS: - -```console -% cd c_glib -% brew bundle -% ./configure PKG_CONFIG_PATH=$(brew --prefix libffi)/lib/pkgconfig:$PKG_CONFIG_PATH -% make -% sudo make install -``` - -Others: - -```console -% cd c_glib -% ./configure -% make -% sudo make install -``` - -If you use Meson, you can build and install Arrow GLib by the followings: +If you use macOS with [Homebrew](https://brew.sh/), you must install +required packages. 
macOS: ```console -% cd c_glib -% brew bundle -% PKG_CONFIG_PATH=$(brew --prefix libffi)/lib/pkgconfig:$PKG_CONFIG_PATH meson build --buildtype=release -% ninja -C build -% sudo ninja -C build install +$ brew bundle +$ meson setup c_glib.build c_glib --buildtype=release +$ meson compile -C c_glib.build +$ sudo meson install -C c_glib.build ``` Others: ```console -% cd c_glib -% meson build --buildtype=release -% ninja -C build -% sudo ninja -C build install +$ meson setup c_glib.build c_glib --buildtype=release +$ meson compile -C c_glib.build +$ sudo meson install -C c_glib.build ``` ### How to build by developers @@ -129,51 +107,46 @@ to build Arrow GLib. You can install them by the followings: On Debian GNU/Linux or Ubuntu: ```console -% sudo apt install -y -V gtk-doc-tools autoconf-archive libgirepository1.0-dev meson ninja-build +$ sudo apt install -y -V gtk-doc-tools libgirepository1.0-dev meson ninja-build ``` -On CentOS 7 or later: +On CentOS 7: ```console -% sudo yum install -y gtk-doc gobject-introspection-devel -% sudo pip install -y meson ninja -``` - -On macOS with [Homebrew](https://brew.sh/): - -```text -% brew bundle +$ sudo yum install -y gtk-doc gobject-introspection-devel ninja-build +$ sudo pip3 install meson ``` -If you use GNU Autotools, you can build and install Arrow GLib by the followings: +On CentOS 8 or later: ```console -% cd c_glib -% ./autogen.sh -% ./configure --enable-gtk-doc -% make -% sudo make install +$ sudo dnf install -y --enablerepo=powertools gtk-doc gobject-introspection-devel ninja-build +$ sudo pip3 install meson ``` -You need to set `PKG_CONFIG_PATH` to `configure` On macOS: +On macOS with [Homebrew](https://brew.sh/): ```console -% ./configure PKG_CONFIG_PATH=$(brew --prefix libffi)/lib/pkgconfig:$PKG_CONFIG_PATH --enable-gtk-doc +$ brew bundle ``` -If you use Meson, you can build and install Arrow GLib by the followings: +You can build and install Arrow GLib by the followings: + +macOS: ```console -% cd c_glib -% meson build
-Dgtk_doc=true -% ninja -C build -% sudo ninja -C build install +$ export XML_CATALOG_FILES=$(brew --prefix)/etc/xml/catalog +$ meson setup c_glib.build c_glib -Dgtk_doc=true +$ meson compile -C c_glib.build +$ sudo meson install -C c_glib.build ``` -You need to set `PKG_CONFIG_PATH` on macOS: +Others: ```console -% PKG_CONFIG_PATH=$(brew --prefix libffi)/lib/pkgconfig:$PKG_CONFIG_PATH meson build -Dgtk_doc=true +$ meson setup c_glib.build c_glib -Dgtk_doc=true +$ meson compile -C c_glib.build +$ sudo meson install -C c_glib.build ``` ## Usage @@ -186,7 +159,7 @@ languages, you use GObject Introspection based bindings. You can find API reference in the `/usr/local/share/gtk-doc/html/arrow-glib/` directory. If you specify -`--prefix` to `configure`, the directory will be different. +`--prefix` to `meson`, the directory will be different. You can find example codes in the `example/` directory. @@ -225,101 +198,118 @@ You can install them by the followings: On Debian GNU/Linux or Ubuntu: ```console -% sudo apt install -y -V ruby-dev -% sudo gem install bundler -% (cd c_glib && bundle install) +$ sudo apt install -y -V ruby-dev +$ sudo gem install bundler +$ (cd c_glib && bundle install) ``` On CentOS 7 or later: ```console -% sudo yum install -y git -% git clone https://github.com/sstephenson/rbenv.git ~/.rbenv -% git clone https://github.com/sstephenson/ruby-build.git ~/.rbenv/plugins/ruby-build -% echo 'export PATH="$HOME/.rbenv/bin:$PATH"' >> ~/.bash_profile -% echo 'eval "$(rbenv init -)"' >> ~/.bash_profile -% exec ${SHELL} --login -% sudo yum install -y gcc make patch openssl-devel readline-devel zlib-devel -% rbenv install 2.4.1 -% rbenv global 2.4.1 -% gem install bundler -% (cd c_glib && bundle install) +$ sudo yum install -y git +$ git clone https://github.com/sstephenson/rbenv.git ~/.rbenv +$ git clone https://github.com/sstephenson/ruby-build.git ~/.rbenv/plugins/ruby-build +$ echo 'export PATH="$HOME/.rbenv/bin:$PATH"' >> ~/.bash_profile +$ echo 'eval "$(rbenv init
-)"' >> ~/.bash_profile +$ exec ${SHELL} --login +$ sudo yum install -y gcc make patch openssl-devel readline-devel zlib-devel +$ latest_ruby_version=$(rbenv install --list 2>&1 | grep '^[0-9]' | tail -n1) +$ rbenv install ${latest_ruby_version} +$ rbenv global ${latest_ruby_version} +$ gem install bundler +$ (cd c_glib && bundle install) ``` On macOS with [Homebrew](https://brew.sh/): ```console -% (cd c_glib && bundle install) +$ (cd c_glib && bundle install) ``` Now, you can run unit tests by the followings: ```console -% cd c_glib -% bundle exec test/run-test.sh +$ cd c_glib.build +$ bundle exec ../c_glib/test/run-test.sh ``` ## Common build problems -### configure failed - `AX_CXX_COMPILE_STDCXX_11(ext, mandatory)' +### build failed - /usr/bin/ld: cannot find -larrow -* Check whether `autoconf-archive` is installed. -* [macOS] `autoconf-archive` must be linked, but may not be linked. You can check it by running `brew install autoconf-archive` again. If it's not linked, it will show a warning message like: +Arrow C++ must be installed to build Arrow GLib. Run `make install` on +Arrow C++ build directory. In addition, on linux, you may need to run +`sudo ldconfig`. -```console -% brew install autoconf-archive -Warning: autoconf-archive 2017.03.21 is already installed, it's just not linked. -You can use `brew link autoconf-archive` to link this version. -``` +### build failed - unable to load http://docbook.sourceforge.net/release/xsl/current/html/chunk.xsl -In this case, you need to run `brew link autoconf-archive`. It may fail with the following message if you have install conflicted packages (e.g. `gnome-common`). +You need to set the following environment variable on macOS: ```console -% brew link autoconf-archive -Linking /usr/local/Cellar/autoconf-archive/2017.03.21... -Error: Could not symlink share/aclocal/ax_check_enable_debug.m4 -Target /usr/local/share/aclocal/ax_check_enable_debug.m4 -is a symlink belonging to gnome-common. 
You can unlink it: - brew unlink gnome-common +$ export XML_CATALOG_FILES="$(brew --prefix)/etc/xml/catalog" ``` -You need to run `brew unlink `, then run `brew link autoconf-archive` again. - -After installing/linking `autoconf-archive`, run `./autogen.sh` again. +### build failed - Symbol not found, referenced from `libsource-highlight.4.dylib` -### [macOS] configure failed - gobject-introspection-1.0 is not installed +You may get the following error on macOS: -gobject-introspection requires libffi, and it's automatically installed with gobject-introspection. However it can't be found because it's [keg-only](https://docs.brew.sh/FAQ.html#what-does-keg-only-mean). You need to set `PKG_CONFIG_PATH` when executing configure. -```console -% ./configure PKG_CONFIG_PATH=$(brew --prefix libffi)/lib/pkgconfig +```text +dyld: Symbol not found: __ZN5boost16re_detail_10650112perl_matcherIPKcNSt3__19allocatorINS_9sub_matchIS3_EEEENS_12regex_traitsIcNS_16cpp_regex_traitsIcEEEEE14construct_initERKNS_11basic_regexIcSC_EENS_15regex_constants12_match_flagsE + Referenced from: /usr/local/Cellar/source-highlight/3.1.8_7/lib/libsource-highlight.4.dylib + Expected in: flat namespace + in /usr/local/Cellar/source-highlight/3.1.8_7/lib/libsource-highlight.4.dylib ``` -### build failed - /usr/bin/ld: cannot find -larrow - -Arrow C++ must be installed to build Arrow GLib. Run `make install` on Arrow C++ build directory. In addition, on linux, you may need to run `sudo ldconfig`. - -### build failed - unable to load http://docbook.sourceforge.net/release/xsl/current/html/chunk.xsl - -On macOS you may need to set the following environment variable: +To fix this error, you need to upgrade `source-highlight`: ```console -% export XML_CATALOG_FILES="/usr/local/etc/xml/catalog" +$ brew upgrade source-highlight ``` -### build failed - Symbol not found, referenced from `libsource-highlight.4.dylib` +### test failed - Failed to load shared library '...' 
referenced by the typelib: dlopen(...): dependent dylib '@rpath/...' not found for '...'. relative file paths not allowed '@rpath/...' -On macOS if you see the following error you may need to upgrade `source-highlight` +You may get the following error on macOS by running test: -```console -dyld: Symbol not found: __ZN5boost16re_detail_10650112perl_matcherIPKcNSt3__19allocatorINS_9sub_matchIS3_EEEENS_12regex_traitsIcNS_16cpp_regex_traitsIcEEEEE14construct_initERKNS_11basic_regexIcSC_EENS_15regex_constants12_match_flagsE - Referenced from: /usr/local/Cellar/source-highlight/3.1.8_7/lib/libsource-highlight.4.dylib - Expected in: flat namespace - in /usr/local/Cellar/source-highlight/3.1.8_7/lib/libsource-highlight.4.dylib +```text +(NULL)-WARNING **: Failed to load shared library '/usr/local/lib/libparquet-glib.400.dylib' referenced by the typelib: dlopen(/usr/local/lib/libparquet-glib.400.dylib, 0x0009): dependent dylib '@rpath/libparquet.400.dylib' not found for '/usr/local/lib/libparquet-glib.400.dylib'. 
relative file paths not allowed '@rpath/libparquet.400.dylib' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/loader.rb:215:in `load_object_info' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/loader.rb:68:in `load_info' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/loader.rb:43:in `block in load' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/repository.rb:34:in `block (2 levels) in each' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/repository.rb:33:in `times' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/repository.rb:33:in `block in each' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/repository.rb:32:in `each' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/repository.rb:32:in `each' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/loader.rb:42:in `load' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection.rb:44:in `load' + from /Users/karlkatzen/Documents/code/arrow-dev/arrow/c_glib/test/run-test.rb:60:in `
' +Traceback (most recent call last): + 17: from /Users/karlkatzen/Documents/code/arrow-dev/arrow/c_glib/test/run-test.rb:80:in `
' + 16: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/autorunner.rb:66:in `run' + 15: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/autorunner.rb:434:in `run' + 14: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/autorunner.rb:106:in `block in ' + 13: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:38:in `collect' + 12: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:136:in `add_load_path' + 11: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:43:in `block in collect' + 10: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:43:in `each' + 9: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:46:in `block (2 levels) in collect' + 8: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:85:in `collect_recursive' + 7: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:85:in `each' + 6: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:87:in `block in collect_recursive' + 5: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:112:in `collect_file' + 4: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:136:in `add_load_path' + 3: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:114:in `block in collect_file' + 2: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:114:in `require' + 1: from /Users/karlkatzen/Documents/code/arrow-dev/arrow/c_glib/test/test-extension-data-type.rb:18:in `' +/Users/karlkatzen/Documents/code/arrow-dev/arrow/c_glib/test/test-extension-data-type.rb:19:in `': uninitialized constant Arrow::ExtensionArray (NameError) ``` -To fix do: +You can't use `@rpath` in Arrow C++. 
To fix this error, you need to +build Arrow C++ with `-DARROW_INSTALL_NAME_RPATH=OFF`: ```console -% brew upgrade source-highlight +$ cmake -S cpp -B cpp.build -DARROW_INSTALL_NAME_RPATH=OFF ... +$ cmake --build cpp.build +$ sudo cmake --build cpp.build --target install ``` diff --git a/c_glib/arrow-cuda-glib/Makefile.am b/c_glib/arrow-cuda-glib/Makefile.am deleted file mode 100644 index bcf20bb549e..00000000000 --- a/c_glib/arrow-cuda-glib/Makefile.am +++ /dev/null @@ -1,130 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -CLEANFILES = - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -if HAVE_ARROW_CUDA -lib_LTLIBRARIES = \ - libarrow-cuda-glib.la - -libarrow_cuda_glib_la_CXXFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CUDA_CFLAGS) \ - $(GARROW_CXXFLAGS) - -libarrow_cuda_glib_la_LDFLAGS = \ - -version-info $(LT_VERSION_INFO) \ - -no-undefined - -libarrow_cuda_glib_la_LIBADD = \ - $(GLIB_LIBS) \ - ../arrow-glib/libarrow-glib.la \ - $(ARROW_CUDA_LIBS) - -libarrow_cuda_glib_la_headers = \ - arrow-cuda-glib.h \ - cuda.h - -libarrow_cuda_glib_la_sources = \ - cuda.cpp \ - $(libarrow_cuda_glib_la_headers) - -libarrow_cuda_glib_la_cpp_headers = \ - arrow-cuda-glib.hpp \ - cuda.hpp - -libarrow_cuda_glib_la_SOURCES = \ - $(libarrow_cuda_glib_la_sources) \ - $(libarrow_cuda_glib_la_cpp_headers) - -arrow_cuda_glib_includedir = \ - $(includedir)/arrow-cuda-glib -arrow_cuda_glib_include_HEADERS = \ - $(libarrow_cuda_glib_la_headers) \ - $(libarrow_cuda_glib_la_cpp_headers) - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = \ - arrow-cuda-glib.pc - -if HAVE_INTROSPECTION --include $(INTROSPECTION_MAKEFILE) -INTROSPECTION_GIRS = -INTROSPECTION_SCANNER_ARGS = -INTROSPECTION_SCANNER_ENV = -if USE_ARROW_BUILD_DIR -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_builddir}/../arrow-glib:$(ARROW_BUILD_DIR)/src/arrow:$${PKG_CONFIG_PATH} -else -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_builddir}/../arrow-glib:$${PKG_CONFIG_PATH} -endif -INTROSPECTION_COMPILER_ARGS = \ - --includedir=$(abs_builddir)/../arrow-glib - -ArrowCUDA-1.0.gir: libarrow-cuda-glib.la -ArrowCUDA_1_0_gir_PACKAGES = \ - arrow-glib -ArrowCUDA_1_0_gir_EXPORT_PACKAGES = \ - arrow-cuda-glib -ArrowCUDA_1_0_gir_INCLUDES = \ - Arrow-1.0 -ArrowCUDA_1_0_gir_CFLAGS = \ - $(AM_CPPFLAGS) -ArrowCUDA_1_0_gir_LIBS = -ArrowCUDA_1_0_gir_FILES = \ - $(libarrow_cuda_glib_la_sources) -ArrowCUDA_1_0_gir_SCANNERFLAGS = \ - --library-path=$(ARROW_LIB_DIR) \ - --warn-all \ - 
--add-include-path=$(abs_builddir)/../arrow-glib \ - --identifier-prefix=GArrowCUDA \ - --symbol-prefix=garrow_cuda -if OS_MACOS -ArrowCUDA_1_0_gir_LIBS += \ - arrow-glib \ - arrow-cuda-glib -ArrowCUDA_1_0_gir_SCANNERFLAGS += \ - --no-libtool \ - --library-path=$(abs_builddir)/../arrow-glib/.libs \ - --library-path=$(abs_builddir)/.libs -else -ArrowCUDA_1_0_gir_LIBS += \ - $(abs_builddir)/../arrow-glib/libarrow-glib.la \ - libarrow-cuda-glib.la -endif - -INTROSPECTION_GIRS += ArrowCUDA-1.0.gir - -girdir = $(datadir)/gir-1.0 -gir_DATA = $(INTROSPECTION_GIRS) - -typelibdir = $(libdir)/girepository-1.0 -typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) - -CLEANFILES += \ - $(gir_DATA) \ - $(typelib_DATA) -endif -endif diff --git a/c_glib/arrow-dataset-glib/Makefile.am b/c_glib/arrow-dataset-glib/Makefile.am deleted file mode 100644 index bd2f9ffdfda..00000000000 --- a/c_glib/arrow-dataset-glib/Makefile.am +++ /dev/null @@ -1,133 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -CLEANFILES = - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -if HAVE_ARROW_DATASET -lib_LTLIBRARIES = \ - libarrow-dataset-glib.la - -libarrow_dataset_glib_la_CXXFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_DATASET_CFLAGS) \ - $(GARROW_CXXFLAGS) - -libarrow_dataset_glib_la_LDFLAGS = \ - -version-info $(LT_VERSION_INFO) \ - -no-undefined - -libarrow_dataset_glib_la_LIBADD = \ - $(GLIB_LIBS) \ - ../arrow-glib/libarrow-glib.la \ - $(ARROW_DATASET_LIBS) - -libarrow_dataset_glib_la_headers = \ - arrow-dataset-glib.h \ - file-format.h \ - scanner.h - -libarrow_dataset_glib_la_sources = \ - file-format.cpp \ - scanner.cpp \ - $(libarrow_dataset_glib_la_headers) - -libarrow_dataset_glib_la_cpp_headers = \ - arrow-dataset-glib.hpp \ - file-format.hpp \ - scanner.hpp - -libarrow_dataset_glib_la_SOURCES = \ - $(libarrow_dataset_glib_la_sources) \ - $(libarrow_dataset_glib_la_cpp_headers) - -arrow_dataset_glib_includedir = \ - $(includedir)/arrow-dataset-glib -arrow_dataset_glib_include_HEADERS = \ - $(libarrow_dataset_glib_la_headers) \ - $(libarrow_dataset_glib_la_cpp_headers) - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = \ - arrow-dataset-glib.pc - -if HAVE_INTROSPECTION --include $(INTROSPECTION_MAKEFILE) -INTROSPECTION_GIRS = -INTROSPECTION_SCANNER_ARGS = -INTROSPECTION_SCANNER_ENV = -if USE_ARROW_BUILD_DIR -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_builddir}/../arrow-glib:$(ARROW_BUILD_DIR)/src/arrow:$${PKG_CONFIG_PATH} -else -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_builddir}/../arrow-glib:$${PKG_CONFIG_PATH} -endif -INTROSPECTION_COMPILER_ARGS = \ - --includedir=$(abs_builddir)/../arrow-glib - -ArrowDataset-1.0.gir: libarrow-dataset-glib.la -ArrowDataset_1_0_gir_PACKAGES = \ - arrow-glib -ArrowDataset_1_0_gir_EXPORT_PACKAGES = \ - arrow-dataset-glib -ArrowDataset_1_0_gir_INCLUDES = \ - Arrow-1.0 -ArrowDataset_1_0_gir_CFLAGS = \ - $(AM_CPPFLAGS) -ArrowDataset_1_0_gir_LIBS = 
-ArrowDataset_1_0_gir_FILES = \ - $(libarrow_dataset_glib_la_sources) -ArrowDataset_1_0_gir_SCANNERFLAGS = \ - --add-include-path=$(abs_builddir)/../arrow-glib \ - --identifier-prefix=GAD \ - --library-path=$(ARROW_LIB_DIR) \ - --symbol-prefix=gad \ - --warn-all -if OS_MACOS -ArrowDataset_1_0_gir_LIBS += \ - arrow-glib \ - arrow-dataset-glib -ArrowDataset_1_0_gir_SCANNERFLAGS += \ - --no-libtool \ - --library-path=$(abs_builddir)/../arrow-glib/.libs \ - --library-path=$(abs_builddir)/.libs -else -ArrowDataset_1_0_gir_LIBS += \ - $(abs_builddir)/../arrow-glib/libarrow-glib.la \ - libarrow-dataset-glib.la -endif - -INTROSPECTION_GIRS += ArrowDataset-1.0.gir - -girdir = $(datadir)/gir-1.0 -gir_DATA = $(INTROSPECTION_GIRS) - -typelibdir = $(libdir)/girepository-1.0 -typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) - -CLEANFILES += \ - $(gir_DATA) \ - $(typelib_DATA) -endif -endif diff --git a/c_glib/arrow-dataset-glib/arrow-dataset-glib.h b/c_glib/arrow-dataset-glib/arrow-dataset-glib.h index 4cb37ea5a99..ff160452845 100644 --- a/c_glib/arrow-dataset-glib/arrow-dataset-glib.h +++ b/c_glib/arrow-dataset-glib/arrow-dataset-glib.h @@ -22,4 +22,5 @@ #include #include +#include #include diff --git a/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp b/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp index 438079eae10..c221825bc2a 100644 --- a/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp +++ b/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp @@ -22,4 +22,5 @@ #include #include +#include #include diff --git a/c_glib/arrow-dataset-glib/fragment.cpp b/c_glib/arrow-dataset-glib/fragment.cpp new file mode 100644 index 00000000000..e442bfabf00 --- /dev/null +++ b/c_glib/arrow-dataset-glib/fragment.cpp @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include + +#include + +G_BEGIN_DECLS + +/** + * SECTION: fragment + * @section_id: fragment + * @title: Fragment classes + * @include: arrow-dataset-glib/arrow-dataset-glib.h + * + * #GADFragment is a base class for all fragment classes. + * + * #GADInMemoryFragment is a class for in-memory fragment. + * + * Since: 4.0.0 + */ + +/* arrow::dataset::Fragment */ + +typedef struct GADFragmentPrivate_ { + std::shared_ptr fragment; +} GADFragmentPrivate; + +enum { + PROP_FRAGMENT = 1, +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADFragment, + gad_fragment, + G_TYPE_OBJECT) + +#define GAD_FRAGMENT_GET_PRIVATE(obj) \ + static_cast( \ + gad_fragment_get_instance_private( \ + GAD_FRAGMENT(obj))) + +static void +gad_fragment_finalize(GObject *object) +{ + auto priv = GAD_FRAGMENT_GET_PRIVATE(object); + + priv->fragment.~shared_ptr(); + + G_OBJECT_CLASS(gad_fragment_parent_class)->finalize(object); +} + +static void +gad_fragment_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAD_FRAGMENT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FRAGMENT: + priv->fragment = + *static_cast *>(g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gad_fragment_init(GADFragment *object) +{ + auto 
priv = GAD_FRAGMENT_GET_PRIVATE(object); + new(&priv->fragment) std::shared_ptr; +} + +static void +gad_fragment_class_init(GADFragmentClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gad_fragment_finalize; + gobject_class->set_property = gad_fragment_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("fragment", + "Fragment", + "The raw std::shared *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FRAGMENT, spec); +} + +/* arrow::dataset::InMemoryFragment */ + +G_DEFINE_TYPE(GADInMemoryFragment, + gad_in_memory_fragment, + GAD_TYPE_FRAGMENT) + +static void +gad_in_memory_fragment_init(GADInMemoryFragment *object) +{ +} + +static void +gad_in_memory_fragment_class_init(GADInMemoryFragmentClass *klass) +{ +} + +/** + * gad_in_memory_fragment_new: + * @schema: A #GArrowSchema. + * @record_batches: (array length=n_record_batches): + * (element-type GArrowRecordBatch): The record batches of the table. + * @n_record_batches: The number of record batches. + * + * Returns: A newly created #GADInMemoryFragment. 
+ * + * Since: 4.0.0 + */ +GADInMemoryFragment * +gad_in_memory_fragment_new(GArrowSchema *schema, + GArrowRecordBatch **record_batches, + gsize n_record_batches) +{ + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector> arrow_record_batches; + arrow_record_batches.reserve(n_record_batches); + for (gsize i = 0; i < n_record_batches; ++i) { + auto arrow_record_batch = garrow_record_batch_get_raw(record_batches[i]); + arrow_record_batches.push_back(arrow_record_batch); + } + auto arrow_in_memory_fragment = + std::make_shared(arrow_schema, + arrow_record_batches); + return gad_in_memory_fragment_new_raw(&arrow_in_memory_fragment); +} + +G_END_DECLS + +GADFragment * +gad_fragment_new_raw(std::shared_ptr *arrow_fragment) +{ + auto fragment = + GAD_FRAGMENT(g_object_new(GAD_TYPE_FRAGMENT, + "fragment", arrow_fragment, + NULL)); + return fragment; +} + +std::shared_ptr +gad_fragment_get_raw(GADFragment *fragment) +{ + auto priv = GAD_FRAGMENT_GET_PRIVATE(fragment); + return priv->fragment; +} + +GADInMemoryFragment * +gad_in_memory_fragment_new_raw(std::shared_ptr *arrow_fragment) +{ + auto fragment = + GAD_IN_MEMORY_FRAGMENT(g_object_new(GAD_TYPE_IN_MEMORY_FRAGMENT, + "fragment", arrow_fragment, + NULL)); + return fragment; +} diff --git a/c_glib/arrow-dataset-glib/fragment.h b/c_glib/arrow-dataset-glib/fragment.h new file mode 100644 index 00000000000..c0ee8769db1 --- /dev/null +++ b/c_glib/arrow-dataset-glib/fragment.h @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +G_BEGIN_DECLS + +/* arrow::dataset::Fragment */ + +#define GAD_TYPE_FRAGMENT (gad_fragment_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADFragment, + gad_fragment, + GAD, + FRAGMENT, + GObject) +struct _GADFragmentClass +{ + GObjectClass parent_class; +}; + +/* arrow::dataset::InMemoryFragment */ + +#define GAD_TYPE_IN_MEMORY_FRAGMENT (gad_in_memory_fragment_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADInMemoryFragment, + gad_in_memory_fragment, + GAD, + IN_MEMORY_FRAGMENT, + GADFragment) +struct _GADInMemoryFragmentClass +{ + GADFragmentClass parent_class; +}; + +GARROW_AVAILABLE_IN_4_0 +GADInMemoryFragment * +gad_in_memory_fragment_new(GArrowSchema *schema, + GArrowRecordBatch **record_batches, + gsize n_record_batches); + +G_END_DECLS diff --git a/c_glib/arrow-dataset-glib/fragment.hpp b/c_glib/arrow-dataset-glib/fragment.hpp new file mode 100644 index 00000000000..441b7c99cb8 --- /dev/null +++ b/c_glib/arrow-dataset-glib/fragment.hpp @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include + +std::shared_ptr +gad_fragment_get_raw(GADFragment *fragment); + +GADFragment* +gad_fragment_new_raw(std::shared_ptr *arrow_fragment); + +GADInMemoryFragment* +gad_in_memory_fragment_new_raw(std::shared_ptr *arrow_fragment); diff --git a/c_glib/arrow-dataset-glib/meson.build b/c_glib/arrow-dataset-glib/meson.build index 49a17382252..83b57504f81 100644 --- a/c_glib/arrow-dataset-glib/meson.build +++ b/c_glib/arrow-dataset-glib/meson.build @@ -19,18 +19,21 @@ sources = files( 'file-format.cpp', + 'fragment.cpp', 'scanner.cpp', ) c_headers = files( 'arrow-dataset-glib.h', 'file-format.h', + 'fragment.h', 'scanner.h', ) cpp_headers = files( 'arrow-dataset-glib.hpp', 'file-format.hpp', + 'fragment.hpp', 'scanner.hpp', ) diff --git a/c_glib/arrow-dataset-glib/scanner.cpp b/c_glib/arrow-dataset-glib/scanner.cpp index 68c3e98ea08..4256dece2f8 100644 --- a/c_glib/arrow-dataset-glib/scanner.cpp +++ b/c_glib/arrow-dataset-glib/scanner.cpp @@ -27,6 +27,7 @@ #include #include +#include #include G_BEGIN_DECLS @@ -37,8 +38,6 @@ G_BEGIN_DECLS * @title: Scanner classes * @include: arrow-dataset-glib/arrow-dataset-glib.h * - * #GADScanContext is a class for a scan context. - * * #GADScanOptions is a class for a set of scan options. * * #GADScanTask is an abstract class for a scan task. 
@@ -48,131 +47,6 @@ G_BEGIN_DECLS * Since: 1.0.0 */ -/* arrow::dataset::ScanContext */ - -typedef struct GADScanContextPrivate_ { - std::shared_ptr scan_context; -} GADScanContextPrivate; - -enum { - PROP_SCAN_CONTEXT = 1, - PROP_USE_THREADS, -}; - -G_DEFINE_TYPE_WITH_PRIVATE(GADScanContext, - gad_scan_context, - G_TYPE_OBJECT) - -#define GAD_SCAN_CONTEXT_GET_PRIVATE(obj) \ - static_cast( \ - gad_scan_context_get_instance_private( \ - GAD_SCAN_CONTEXT(obj))) - -static void -gad_scan_context_finalize(GObject *object) -{ - auto priv = GAD_SCAN_CONTEXT_GET_PRIVATE(object); - - priv->scan_context.~shared_ptr(); - - G_OBJECT_CLASS(gad_scan_context_parent_class)->finalize(object); -} - -static void -gad_scan_context_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) -{ - auto priv = GAD_SCAN_CONTEXT_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_SCAN_CONTEXT: - priv->scan_context = - *static_cast *>(g_value_get_pointer(value)); - break; - case PROP_USE_THREADS: - priv->scan_context->use_threads = g_value_get_boolean(value); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } -} - -static void -gad_scan_context_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) -{ - auto priv = GAD_SCAN_CONTEXT_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_USE_THREADS: - g_value_set_boolean(value, priv->scan_context->use_threads); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } -} - -static void -gad_scan_context_init(GADScanContext *object) -{ - auto priv = GAD_SCAN_CONTEXT_GET_PRIVATE(object); - new(&priv->scan_context) std::shared_ptr; -} - -static void -gad_scan_context_class_init(GADScanContextClass *klass) -{ - auto gobject_class = G_OBJECT_CLASS(klass); - - gobject_class->finalize = gad_scan_context_finalize; - gobject_class->set_property = gad_scan_context_set_property; - gobject_class->get_property = 
gad_scan_context_get_property; - - auto scan_context = arrow::dataset::ScanContext(); - - GParamSpec *spec; - spec = g_param_spec_pointer("scan-context", - "ScanContext", - "The raw std::shared *", - static_cast(G_PARAM_WRITABLE | - G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_SCAN_CONTEXT, spec); - - /** - * GADScanContext:use-threads: - * - * Indicate if the Scanner should make use of a ThreadPool. - * - * Since: 1.0.0 - */ - spec = g_param_spec_boolean("use-threads", - "Use threads", - "Indicate if the Scanner should make use of a ThreadPool", - scan_context.use_threads, - static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, PROP_USE_THREADS, spec); -} - -/** - * gad_scan_context_new: - * - * Returns: A newly created #GADScanContext. - * - * Since: 1.0.0 - */ -GADScanContext * -gad_scan_context_new(void) -{ - auto arrow_scan_context = std::make_shared(); - return gad_scan_context_new_raw(&arrow_scan_context); -} - /* arrow::dataset::ScanOptions */ typedef struct GADScanOptionsPrivate_ { @@ -185,6 +59,7 @@ enum { PROP_EVALUATOR, PROP_PROJECTOR, PROP_BATCH_SIZE, + PROP_USE_THREADS, }; G_DEFINE_TYPE_WITH_PRIVATE(GADScanOptions, @@ -222,6 +97,9 @@ gad_scan_options_set_property(GObject *object, case PROP_BATCH_SIZE: priv->scan_options->batch_size = g_value_get_int64(value); break; + case PROP_USE_THREADS: + priv->scan_options->use_threads = g_value_get_boolean(value); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -240,6 +118,9 @@ gad_scan_options_get_property(GObject *object, case PROP_BATCH_SIZE: g_value_set_int64(value, priv->scan_options->batch_size); break; + case PROP_USE_THREADS: + g_value_set_boolean(value, priv->scan_options->use_threads); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -265,7 +146,7 @@ gad_scan_options_class_init(GADScanOptionsClass *klass) gobject_class->set_property = gad_scan_options_set_property; 
gobject_class->get_property = gad_scan_options_get_property; - auto scan_options = arrow::dataset::ScanOptions::Make(arrow::schema({})); + auto scan_options = std::make_shared(); spec = g_param_spec_pointer("scan-options", "ScanOptions", @@ -293,6 +174,20 @@ gad_scan_options_class_init(GADScanOptionsClass *klass) scan_options->batch_size, static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, PROP_BATCH_SIZE, spec); + + /** + * GADScanOptions:use-threads: + * + * Indicate if the Scanner should make use of a ThreadPool. + * + * Since: 4.0.0 + */ + spec = g_param_spec_boolean("use-threads", + "Use threads", + "Indicate if the Scanner should make use of a ThreadPool", + scan_options->use_threads, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_USE_THREADS, spec); } /** @@ -307,7 +202,8 @@ GADScanOptions * gad_scan_options_new(GArrowSchema *schema) { auto arrow_schema = garrow_schema_get_raw(schema); - auto arrow_scan_options = arrow::dataset::ScanOptions::Make(arrow_schema); + auto arrow_scan_options = std::make_shared(); + arrow_scan_options->dataset_schema = arrow_schema; return gad_scan_options_new_raw(&arrow_scan_options); } @@ -323,42 +219,22 @@ GArrowSchema * gad_scan_options_get_schema(GADScanOptions *scan_options) { auto priv = GAD_SCAN_OPTIONS_GET_PRIVATE(scan_options); - auto arrow_schema = priv->scan_options->schema(); + auto arrow_schema = priv->scan_options->dataset_schema; return garrow_schema_new_raw(&arrow_schema); } -/** - * gad_scan_options_replace_schema: - * @scan_options: A #GADScanOptions. - * @schema: A #GArrowSchema. - * - * Returns: (transfer full): - * A copy of the #GADScanOptions with the given #GArrowSchema. 
- * - * Since: 1.0.0 - */ -GADScanOptions * -gad_scan_options_replace_schema(GADScanOptions *scan_options, - GArrowSchema *schema) -{ - auto priv = GAD_SCAN_OPTIONS_GET_PRIVATE(scan_options); - auto arrow_schema = garrow_schema_get_raw(schema); - auto arrow_scan_options_copy = priv->scan_options->ReplaceSchema(arrow_schema); - return gad_scan_options_new_raw(&arrow_scan_options_copy); -} - /* arrow::dataset::ScanTask */ typedef struct GADScanTaskPrivate_ { std::shared_ptr scan_task; GADScanOptions *options; - GADScanContext *context; + GADFragment *fragment; } GADScanTaskPrivate; enum { PROP_SCAN_TASK = 1, PROP_OPTIONS, - PROP_CONTEXT, + PROP_FRAGMENT, }; G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADScanTask, @@ -380,9 +256,9 @@ gad_scan_task_dispose(GObject *object) priv->options = NULL; } - if (priv->context) { - g_object_unref(priv->context); - priv->context = NULL; + if (priv->fragment) { + g_object_unref(priv->fragment); + priv->fragment = NULL; } G_OBJECT_CLASS(gad_scan_task_parent_class)->dispose(object); @@ -414,8 +290,8 @@ gad_scan_task_set_property(GObject *object, case PROP_OPTIONS: priv->options = GAD_SCAN_OPTIONS(g_value_dup_object(value)); break; - case PROP_CONTEXT: - priv->context = GAD_SCAN_CONTEXT(g_value_dup_object(value)); + case PROP_FRAGMENT: + priv->fragment = GAD_FRAGMENT(g_value_dup_object(value)); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); @@ -435,8 +311,8 @@ gad_scan_task_get_property(GObject *object, case PROP_OPTIONS: g_value_set_object(value, priv->options); break; - case PROP_CONTEXT: - g_value_set_object(value, priv->context); + case PROP_FRAGMENT: + g_value_set_object(value, priv->fragment); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); @@ -485,19 +361,19 @@ gad_scan_task_class_init(GADScanTaskClass *klass) g_object_class_install_property(gobject_class, PROP_OPTIONS, spec); /** - * GADScanTask:context: + * GADScanTask:fragment: * - * The context of the scan task. 
+ * The fragment of the scan task. * - * Since: 1.0.0 + * Since: 4.0.0 */ - spec = g_param_spec_object("context", - "Context", - "The context of the scan task", - GAD_TYPE_SCAN_CONTEXT, + spec = g_param_spec_object("fragment", + "Fragment", + "The fragment of the scan task", + GAD_TYPE_FRAGMENT, static_cast(G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_CONTEXT, spec); + g_object_class_install_property(gobject_class, PROP_FRAGMENT, spec); } /** @@ -522,24 +398,24 @@ gad_scan_task_get_options(GADScanTask *scan_task) } /** - * gad_scan_task_get_context: - * @scan_task: A #GADScanTask. + * gad_scan_task_get_fragment: + * @scan_task: A #GADFragment. * - * Returns: (transfer full): A #GADScanContext. + * Returns: (transfer full): A #GADFragment. * - * Since: 1.0.0 + * Since: 4.0.0 */ -GADScanContext * -gad_scan_task_get_context(GADScanTask *scan_task) +GADFragment * +gad_scan_task_get_fragment(GADScanTask *scan_task) { auto priv = GAD_SCAN_TASK_GET_PRIVATE(scan_task); - if (priv->context) { - g_object_ref(priv->context); - return priv->context; + if (priv->fragment) { + g_object_ref(priv->fragment); + return priv->fragment; } - auto arrow_context = priv->scan_task->context(); - return gad_scan_context_new_raw(&arrow_context); + auto arrow_fragment = priv->scan_task->fragment(); + return gad_fragment_new_raw(&arrow_fragment); } /** @@ -587,7 +463,7 @@ gad_in_memory_scan_task_class_init(GADInMemoryScanTaskClass *klass) * (element-type GArrowRecordBatch): The record batches of the table. * @n_record_batches: The number of record batches. * @options: A #GADScanOptions. - * @context: A #GADScanContext. + * @fragment: A #GADInMemoryFragment. * * Returns: A newly created #GADInMemoryScanTask. 
* @@ -597,7 +473,7 @@ GADInMemoryScanTask * gad_in_memory_scan_task_new(GArrowRecordBatch **record_batches, gsize n_record_batches, GADScanOptions *options, - GADScanContext *context) + GADInMemoryFragment *fragment) { std::vector> arrow_record_batches; arrow_record_batches.reserve(n_record_batches); @@ -606,35 +482,18 @@ gad_in_memory_scan_task_new(GArrowRecordBatch **record_batches, arrow_record_batches.push_back(arrow_record_batch); } auto arrow_options = gad_scan_options_get_raw(options); - auto arrow_context = gad_scan_context_get_raw(context); + auto arrow_fragment = gad_fragment_get_raw(GAD_FRAGMENT(fragment)); auto arrow_in_memory_scan_task = std::make_shared(arrow_record_batches, arrow_options, - arrow_context); + arrow_fragment); return gad_in_memory_scan_task_new_raw(&arrow_in_memory_scan_task, options, - context); + fragment); } G_END_DECLS -GADScanContext * -gad_scan_context_new_raw(std::shared_ptr *arrow_scan_context) -{ - auto scan_context = - GAD_SCAN_CONTEXT(g_object_new(GAD_TYPE_SCAN_CONTEXT, - "scan-context", arrow_scan_context, - NULL)); - return scan_context; -} - -std::shared_ptr -gad_scan_context_get_raw(GADScanContext *scan_context) -{ - auto priv = GAD_SCAN_CONTEXT_GET_PRIVATE(scan_context); - return priv->scan_context; -} - GADScanOptions * gad_scan_options_new_raw(std::shared_ptr *arrow_scan_options) { @@ -655,13 +514,13 @@ gad_scan_options_get_raw(GADScanOptions *scan_options) GADInMemoryScanTask * gad_in_memory_scan_task_new_raw(std::shared_ptr *arrow_in_memory_scan_task, GADScanOptions *options, - GADScanContext *context) + GADInMemoryFragment *fragment) { auto in_memory_scan_task = GAD_IN_MEMORY_SCAN_TASK(g_object_new(GAD_TYPE_IN_MEMORY_SCAN_TASK, "scan-task", arrow_in_memory_scan_task, "options", options, - "context", context, + "fragment", fragment, NULL)); return in_memory_scan_task; } diff --git a/c_glib/arrow-dataset-glib/scanner.h b/c_glib/arrow-dataset-glib/scanner.h index 75d212b1808..f387e8948f2 100644 --- 
a/c_glib/arrow-dataset-glib/scanner.h +++ b/c_glib/arrow-dataset-glib/scanner.h @@ -21,23 +21,9 @@ #include -G_BEGIN_DECLS - -/* arrow::dataset::ScanContext */ - -#define GAD_TYPE_SCAN_CONTEXT (gad_scan_context_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADScanContext, - gad_scan_context, - GAD, - SCAN_CONTEXT, - GObject) -struct _GADScanContextClass -{ - GObjectClass parent_class; -}; +#include -GARROW_AVAILABLE_IN_1_0 -GADScanContext *gad_scan_context_new(void); +G_BEGIN_DECLS /* arrow::dataset::ScanOptions */ @@ -57,9 +43,6 @@ GARROW_AVAILABLE_IN_1_0 GADScanOptions *gad_scan_options_new(GArrowSchema *schema); GARROW_AVAILABLE_IN_1_0 GArrowSchema *gad_scan_options_get_schema(GADScanOptions *scan_options); -GARROW_AVAILABLE_IN_1_0 -GADScanOptions *gad_scan_options_replace_schema(GADScanOptions *scan_options, - GArrowSchema *schema); /* arrow::dataset::ScanTask */ @@ -76,8 +59,8 @@ struct _GADScanTaskClass GARROW_AVAILABLE_IN_1_0 GADScanOptions *gad_scan_task_get_options(GADScanTask *scan_task); -GARROW_AVAILABLE_IN_1_0 -GADScanContext *gad_scan_task_get_context(GADScanTask *scan_task); +GARROW_AVAILABLE_IN_4_0 +GADFragment *gad_scan_task_get_fragment(GADScanTask *scan_task); GARROW_AVAILABLE_IN_1_0 GArrowRecordBatchIterator *gad_scan_task_execute(GADScanTask *scan_task, GError **error); @@ -100,6 +83,6 @@ GADInMemoryScanTask * gad_in_memory_scan_task_new(GArrowRecordBatch **record_batches, gsize n_record_batches, GADScanOptions *options, - GADScanContext *context); + GADInMemoryFragment *fragment); G_END_DECLS diff --git a/c_glib/arrow-dataset-glib/scanner.hpp b/c_glib/arrow-dataset-glib/scanner.hpp index 7952a5813e5..f10351ee99b 100644 --- a/c_glib/arrow-dataset-glib/scanner.hpp +++ b/c_glib/arrow-dataset-glib/scanner.hpp @@ -21,13 +21,9 @@ #include +#include #include -GADScanContext * -gad_scan_context_new_raw(std::shared_ptr *arrow_scan_context); -std::shared_ptr -gad_scan_context_get_raw(GADScanContext *scan_context); - GADScanOptions * 
gad_scan_options_new_raw(std::shared_ptr *arrow_scan_options); std::shared_ptr @@ -36,4 +32,4 @@ gad_scan_options_get_raw(GADScanOptions *scan_options); GADInMemoryScanTask * gad_in_memory_scan_task_new_raw(std::shared_ptr *arrow_in_memory_scan_task, GADScanOptions *scan_options, - GADScanContext *scan_context); + GADInMemoryFragment *fragment); diff --git a/c_glib/arrow-glib/Makefile.am b/c_glib/arrow-glib/Makefile.am deleted file mode 100644 index 9f19578d537..00000000000 --- a/c_glib/arrow-glib/Makefile.am +++ /dev/null @@ -1,309 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -CLEANFILES = -DISTCLEANFILES = - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(GARROW_CFLAGS) - -# libarrow-glib -lib_LTLIBRARIES = \ - libarrow-glib.la - -libarrow_glib_la_CXXFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CFLAGS) \ - $(GARROW_CXXFLAGS) - -libarrow_glib_la_LDFLAGS = \ - -version-info $(LT_VERSION_INFO) \ - -no-undefined - -libarrow_glib_la_LIBADD = \ - $(GLIB_LIBS) \ - $(ARROW_LIBS) - -libarrow_glib_la_headers = \ - array.h \ - array-builder.h \ - arrow-glib.h \ - basic-array.h \ - basic-data-type.h \ - buffer.h \ - chunked-array.h \ - codec.h \ - composite-array.h \ - composite-data-type.h \ - data-type.h \ - datum.h \ - decimal.h \ - error.h \ - field.h \ - gobject-type.h \ - record-batch.h \ - schema.h \ - table.h \ - table-builder.h \ - tensor.h \ - type.h - -libarrow_glib_la_headers += \ - file.h \ - file-mode.h \ - input-stream.h \ - output-stream.h \ - readable.h \ - writable.h \ - writable-file.h - -libarrow_glib_la_headers += \ - ipc-options.h \ - metadata-version.h \ - reader.h \ - writer.h - -libarrow_glib_la_headers += \ - compute.h - -libarrow_glib_la_headers += \ - file-system.h \ - local-file-system.h - -if HAVE_ARROW_ORC -libarrow_glib_la_headers += \ - orc-file-reader.h -endif - -libarrow_glib_la_generated_headers = \ - enums.h \ - version.h - -libarrow_glib_la_generated_sources = \ - enums.c \ - $(libarrow_glib_la_generated_headers) - -libarrow_glib_la_sources = \ - array-builder.cpp \ - basic-array.cpp \ - basic-data-type.cpp \ - buffer.cpp \ - chunked-array.cpp \ - codec.cpp \ - composite-array.cpp \ - composite-data-type.cpp \ - datum.cpp \ - decimal.cpp \ - error.cpp \ - field.cpp \ - record-batch.cpp \ - schema.cpp \ - table.cpp \ - table-builder.cpp \ - tensor.cpp \ - type.cpp \ - $(libarrow_glib_la_headers) \ - $(libarrow_glib_la_generated_sources) - -libarrow_glib_la_sources += \ - file.cpp \ - file-mode.cpp \ - input-stream.cpp \ - 
output-stream.cpp \ - readable.cpp \ - writable.cpp \ - writable-file.cpp - -libarrow_glib_la_sources += \ - ipc-options.cpp \ - metadata-version.cpp \ - reader.cpp \ - writer.cpp - -libarrow_glib_la_sources += \ - compute.cpp - -libarrow_glib_la_sources += \ - file-system.cpp \ - local-file-system.cpp - -if HAVE_ARROW_ORC -libarrow_glib_la_sources += \ - orc-file-reader.cpp -endif - -libarrow_glib_la_cpp_headers = \ - array.hpp \ - array-builder.hpp \ - arrow-glib.hpp \ - basic-array.hpp \ - basic-data-type.hpp \ - buffer.hpp \ - chunked-array.hpp \ - codec.hpp \ - data-type.hpp \ - datum.hpp \ - decimal.hpp \ - error.hpp \ - field.hpp \ - record-batch.hpp \ - schema.hpp \ - table.hpp \ - table-builder.hpp \ - tensor.hpp \ - type.hpp - -libarrow_glib_la_cpp_headers += \ - file.hpp \ - file-mode.hpp \ - input-stream.hpp \ - output-stream.hpp \ - readable.hpp \ - writable.hpp \ - writable-file.hpp - -libarrow_glib_la_cpp_headers += \ - ipc-options.hpp \ - metadata-version.hpp \ - reader.hpp \ - writer.hpp - -libarrow_glib_la_cpp_headers += \ - compute.hpp - -libarrow_glib_la_cpp_headers += \ - file-system.hpp \ - local-file-system.hpp - -if HAVE_ARROW_ORC -libarrow_glib_la_cpp_headers += \ - orc-file-reader.hpp -endif - -libarrow_glib_la_cpp_internal_headers = \ - internal-hash-table.hpp \ - internal-index.hpp - -libarrow_glib_la_SOURCES = \ - $(libarrow_glib_la_sources) \ - $(libarrow_glib_la_cpp_headers) \ - $(libarrow_glib_la_cpp_internal_headers) - -BUILT_SOURCES = \ - $(libarrow_glib_la_generated_headers) \ - $(libarrow_glib_la_generated_sources) \ - stamp-enums.c \ - stamp-enums.h - -DISTCLEANFILES += \ - stamp-enums.c \ - stamp-enums.h - -EXTRA_DIST += \ - enums.c.template \ - enums.h.template - -enums.h: stamp-enums.h - @true -stamp-enums.h: $(libarrow_glib_la_headers) enums.h.template - $(AM_V_GEN) \ - (cd $(srcdir) && \ - $(GLIB_MKENUMS) \ - --identifier-prefix GArrow \ - --symbol-prefix garrow \ - --template enums.h.template \ - 
$(libarrow_glib_la_headers)) > enums.h - touch $@ - -enums.c: stamp-enums.c - @true -stamp-enums.c: $(libarrow_glib_la_headers) enums.c.template - $(AM_V_GEN) \ - (cd $(srcdir) && \ - $(GLIB_MKENUMS) \ - --identifier-prefix GArrow \ - --symbol-prefix garrow \ - --template enums.c.template \ - $(libarrow_glib_la_headers)) > enums.c - touch $@ - -arrow_glib_includedir = $(includedir)/arrow-glib -arrow_glib_include_HEADERS = \ - $(libarrow_glib_la_headers) \ - $(libarrow_glib_la_cpp_headers) \ - $(libarrow_glib_la_generated_headers) - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = \ - arrow-glib.pc - -if HAVE_ARROW_ORC -pkgconfig_DATA += \ - arrow-orc-glib.pc -endif - -if HAVE_INTROSPECTION --include $(INTROSPECTION_MAKEFILE) -INTROSPECTION_GIRS = -INTROSPECTION_SCANNER_ARGS = -INTROSPECTION_SCANNER_ENV = -INTROSPECTION_COMPILER_ARGS = - -Arrow-1.0.gir: libarrow-glib.la -Arrow_1_0_gir_PACKAGES = \ - gio-2.0 -Arrow_1_0_gir_EXPORT_PACKAGES = \ - arrow-glib -Arrow_1_0_gir_INCLUDES = \ - Gio-2.0 -Arrow_1_0_gir_CFLAGS = \ - $(AM_CPPFLAGS) -Arrow_1_0_gir_LIBS = -Arrow_1_0_gir_FILES = $(libarrow_glib_la_sources) -Arrow_1_0_gir_SCANNERFLAGS = \ - --library-path=$(ARROW_LIB_DIR) \ - --warn-all \ - --identifier-prefix=GArrow \ - --symbol-prefix=garrow -if OS_MACOS -Arrow_1_0_gir_LIBS += arrow-glib -Arrow_1_0_gir_SCANNERFLAGS += \ - --no-libtool \ - --library-path=$(abs_builddir)/.libs -else -Arrow_1_0_gir_LIBS += libarrow-glib.la -endif -INTROSPECTION_GIRS += Arrow-1.0.gir - -girdir = $(datadir)/gir-1.0 -gir_DATA = $(INTROSPECTION_GIRS) - -typelibdir = $(libdir)/girepository-1.0 -typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) - -CLEANFILES += \ - $(gir_DATA) \ - $(typelib_DATA) -endif diff --git a/c_glib/arrow-glib/arrow-glib.pc.in b/c_glib/arrow-glib/arrow-glib.pc.in deleted file mode 100644 index f9f27b24990..00000000000 --- a/c_glib/arrow-glib/arrow-glib.pc.in +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more 
contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: Apache Arrow GLib -Description: C API for Apache Arrow based on GLib -Version: @VERSION@ -Libs: -L${libdir} -larrow-glib -Cflags: -I${includedir} -Requires: gobject-2.0 arrow diff --git a/c_glib/arrow-glib/basic-array.cpp b/c_glib/arrow-glib/basic-array.cpp index eb1cd6aa168..83c35f91415 100644 --- a/c_glib/arrow-glib/basic-array.cpp +++ b/c_glib/arrow-glib/basic-array.cpp @@ -685,6 +685,42 @@ garrow_array_diff_unified(GArrowArray *array, GArrowArray *other_array) } } +/** + * garrow_array_concatenate: + * @array: A #GArrowArray. + * @other_arrays: (element-type GArrowArray): A #GArrowArray to be + * concatenated. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable) (transfer full): The concatenated array. 
+ * + * Since: 4.0.0 + */ +GArrowArray * +garrow_array_concatenate(GArrowArray *array, + GList *other_arrays, + GError **error) +{ + if (!other_arrays) { + g_object_ref(array); + return array; + } + arrow::ArrayVector arrow_arrays; + arrow_arrays.push_back(garrow_array_get_raw(array)); + for (auto node = other_arrays; node; node = node->next) { + auto other_array = GARROW_ARRAY(node->data); + arrow_arrays.push_back(garrow_array_get_raw(other_array)); + } + auto arrow_concatenated_array = arrow::Concatenate(arrow_arrays); + if (garrow::check(error, + arrow_concatenated_array, + "[array][concatenate]")) { + return garrow_array_new_raw(&(*arrow_concatenated_array)); + } else { + return NULL; + } +} + G_DEFINE_TYPE(GArrowNullArray, garrow_null_array, diff --git a/c_glib/arrow-glib/basic-array.h b/c_glib/arrow-glib/basic-array.h index 2bf93052013..9835db5e67a 100644 --- a/c_glib/arrow-glib/basic-array.h +++ b/c_glib/arrow-glib/basic-array.h @@ -67,6 +67,10 @@ GArrowArray *garrow_array_view(GArrowArray *array, GARROW_AVAILABLE_IN_0_15 gchar *garrow_array_diff_unified(GArrowArray *array, GArrowArray *other_array); +GARROW_AVAILABLE_IN_4_0 +GArrowArray *garrow_array_concatenate(GArrowArray *array, + GList *other_arrays, + GError **error); #define GARROW_TYPE_NULL_ARRAY (garrow_null_array_get_type()) diff --git a/c_glib/arrow-glib/chunked-array.cpp b/c_glib/arrow-glib/chunked-array.cpp index d2ba8ffbb79..bdb8976f35b 100644 --- a/c_glib/arrow-glib/chunked-array.cpp +++ b/c_glib/arrow-glib/chunked-array.cpp @@ -330,6 +330,30 @@ garrow_chunked_array_to_string(GArrowChunkedArray *chunked_array, GError **error return g_strdup(arrow_chunked_array->ToString().c_str()); } +/** + * garrow_chunked_array_combine: + * @chunked_array: A #GArrowChunkedArray. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable) (transfer full): The combined array that has + * all data in all chunks. 
+ * + * Since: 4.0.0 + */ +GArrowArray * +garrow_chunked_array_combine(GArrowChunkedArray *chunked_array, GError **error) +{ + const auto arrow_chunked_array = garrow_chunked_array_get_raw(chunked_array); + auto arrow_combined_array = arrow::Concatenate(arrow_chunked_array->chunks()); + if (garrow::check(error, + arrow_combined_array, + "[chunked-array][combine]")) { + return garrow_array_new_raw(&(*arrow_combined_array)); + } else { + return NULL; + } +} + G_END_DECLS GArrowChunkedArray * diff --git a/c_glib/arrow-glib/chunked-array.h b/c_glib/arrow-glib/chunked-array.h index 1f7347f2b0f..8e721f0bf99 100644 --- a/c_glib/arrow-glib/chunked-array.h +++ b/c_glib/arrow-glib/chunked-array.h @@ -59,5 +59,8 @@ GArrowChunkedArray *garrow_chunked_array_slice(GArrowChunkedArray *chunked_array guint64 length); gchar *garrow_chunked_array_to_string(GArrowChunkedArray *chunked_array, GError **error); +GARROW_AVAILABLE_IN_4_0 +GArrowArray *garrow_chunked_array_combine(GArrowChunkedArray *chunked_array, + GError **error); G_END_DECLS diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp index 17100e76a3c..636a3c74707 100644 --- a/c_glib/arrow-glib/reader.cpp +++ b/c_glib/arrow-glib/reader.cpp @@ -1479,10 +1479,12 @@ garrow_csv_read_options_add_column_name(GArrowCSVReadOptions *options, typedef struct GArrowCSVReaderPrivate_ { std::shared_ptr reader; + GArrowInputStream *input; } GArrowCSVReaderPrivate; enum { - PROP_CSV_TABLE_READER = 1 + PROP_CSV_TABLE_READER = 1, + PROP_CSV_READER_INPUT, }; G_DEFINE_TYPE_WITH_PRIVATE(GArrowCSVReader, @@ -1499,11 +1501,24 @@ garrow_csv_reader_dispose(GObject *object) { auto priv = GARROW_CSV_READER_GET_PRIVATE(object); - priv->reader = nullptr; + if (priv->input) { + g_object_unref(priv->input); + priv->input = nullptr; + } G_OBJECT_CLASS(garrow_csv_reader_parent_class)->dispose(object); } +static void +garrow_csv_reader_finalize(GObject *object) +{ + auto priv = GARROW_CSV_READER_GET_PRIVATE(object); + + 
priv->reader.~shared_ptr(); + + G_OBJECT_CLASS(garrow_csv_reader_parent_class)->finalize(object); +} + static void garrow_csv_reader_set_property(GObject *object, guint prop_id, @@ -1517,6 +1532,9 @@ garrow_csv_reader_set_property(GObject *object, priv->reader = *static_cast *>(g_value_get_pointer(value)); break; + case PROP_CSV_READER_INPUT: + priv->input = GARROW_INPUT_STREAM(g_value_dup_object(value)); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -1529,7 +1547,12 @@ garrow_csv_reader_get_property(GObject *object, GValue *value, GParamSpec *pspec) { + auto priv = GARROW_CSV_READER_GET_PRIVATE(object); + switch (prop_id) { + case PROP_CSV_READER_INPUT: + g_value_set_object(value, priv->input); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -1539,25 +1562,37 @@ garrow_csv_reader_get_property(GObject *object, static void garrow_csv_reader_init(GArrowCSVReader *object) { + auto priv = GARROW_CSV_READER_GET_PRIVATE(object); + new(&priv->reader) std::shared_ptr; } static void garrow_csv_reader_class_init(GArrowCSVReaderClass *klass) { - GParamSpec *spec; - auto gobject_class = G_OBJECT_CLASS(klass); gobject_class->dispose = garrow_csv_reader_dispose; + gobject_class->finalize = garrow_csv_reader_finalize; gobject_class->set_property = garrow_csv_reader_set_property; gobject_class->get_property = garrow_csv_reader_get_property; + GParamSpec *spec; spec = g_param_spec_pointer("csv-table-reader", "CSV table reader", "The raw std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_CSV_TABLE_READER, spec); + + spec = g_param_spec_object("input", + "Input", + "The input stream to be read", + GARROW_TYPE_INPUT_STREAM, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, + PROP_CSV_READER_INPUT, + spec); } /** @@ -1591,14 +1626,13 @@ 
garrow_csv_reader_new(GArrowInputStream *input, } auto arrow_reader = - arrow::csv::TableReader::Make(arrow::default_memory_pool(), - arrow::io::AsyncContext(), + arrow::csv::TableReader::Make(arrow::io::default_io_context(), arrow_input, read_options, parse_options, convert_options); if (garrow::check(error, arrow_reader, "[csv-reader][new]")) { - return garrow_csv_reader_new_raw(&(arrow_reader.ValueOrDie())); + return garrow_csv_reader_new_raw(&(*arrow_reader), input); } else { return NULL; } @@ -1634,11 +1668,11 @@ typedef struct GArrowJSONReadOptionsPrivate_ { } GArrowJSONReadOptionsPrivate; enum { - PROP_JSON_READER_USE_THREADS = 1, - PROP_JSON_READER_BLOCK_SIZE, - PROP_JSON_READER_ALLOW_NEWLINES_IN_VALUES, - PROP_JSON_READER_UNEXPECTED_FIELD_BEHAVIOR, - PROP_JSON_READER_SCHEMA + PROP_JSON_READ_OPTIONS_USE_THREADS = 1, + PROP_JSON_READ_OPTIONS_BLOCK_SIZE, + PROP_JSON_READ_OPTIONS_ALLOW_NEWLINES_IN_VALUES, + PROP_JSON_READ_OPTIONS_UNEXPECTED_FIELD_BEHAVIOR, + PROP_JSON_READ_OPTIONS_SCHEMA, }; G_DEFINE_TYPE_WITH_PRIVATE(GArrowJSONReadOptions, @@ -1672,20 +1706,20 @@ garrow_json_read_options_set_property(GObject *object, auto priv = GARROW_JSON_READ_OPTIONS_GET_PRIVATE(object); switch (prop_id) { - case PROP_JSON_READER_USE_THREADS: + case PROP_JSON_READ_OPTIONS_USE_THREADS: priv->read_options.use_threads = g_value_get_boolean(value); break; - case PROP_JSON_READER_BLOCK_SIZE: + case PROP_JSON_READ_OPTIONS_BLOCK_SIZE: priv->read_options.block_size = g_value_get_int(value); break; - case PROP_JSON_READER_ALLOW_NEWLINES_IN_VALUES: + case PROP_JSON_READ_OPTIONS_ALLOW_NEWLINES_IN_VALUES: priv->parse_options.newlines_in_values = g_value_get_boolean(value); break; - case PROP_JSON_READER_UNEXPECTED_FIELD_BEHAVIOR: + case PROP_JSON_READ_OPTIONS_UNEXPECTED_FIELD_BEHAVIOR: priv->parse_options.unexpected_field_behavior = static_cast(g_value_get_enum(value)); break; - case PROP_JSON_READER_SCHEMA: + case PROP_JSON_READ_OPTIONS_SCHEMA: { auto schema = 
g_value_dup_object(value); if (priv->schema) { @@ -1715,19 +1749,19 @@ garrow_json_read_options_get_property(GObject *object, auto priv = GARROW_JSON_READ_OPTIONS_GET_PRIVATE(object); switch (prop_id) { - case PROP_JSON_READER_USE_THREADS: + case PROP_JSON_READ_OPTIONS_USE_THREADS: g_value_set_boolean(value, priv->read_options.use_threads); break; - case PROP_JSON_READER_BLOCK_SIZE: + case PROP_JSON_READ_OPTIONS_BLOCK_SIZE: g_value_set_int(value, priv->read_options.block_size); break; - case PROP_JSON_READER_ALLOW_NEWLINES_IN_VALUES: + case PROP_JSON_READ_OPTIONS_ALLOW_NEWLINES_IN_VALUES: g_value_set_boolean(value, priv->parse_options.newlines_in_values); break; - case PROP_JSON_READER_UNEXPECTED_FIELD_BEHAVIOR: + case PROP_JSON_READ_OPTIONS_UNEXPECTED_FIELD_BEHAVIOR: g_value_set_enum(value, static_cast(priv->parse_options.unexpected_field_behavior)); break; - case PROP_JSON_READER_SCHEMA: + case PROP_JSON_READ_OPTIONS_SCHEMA: g_value_set_object(value, priv->schema); break; default: @@ -1770,7 +1804,7 @@ garrow_json_read_options_class_init(GArrowJSONReadOptionsClass *klass) read_options.use_threads, static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, - PROP_JSON_READER_USE_THREADS, + PROP_JSON_READ_OPTIONS_USE_THREADS, spec); /** @@ -1791,7 +1825,7 @@ garrow_json_read_options_class_init(GArrowJSONReadOptionsClass *klass) read_options.block_size, static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, - PROP_JSON_READER_BLOCK_SIZE, + PROP_JSON_READ_OPTIONS_BLOCK_SIZE, spec); @@ -1813,7 +1847,7 @@ garrow_json_read_options_class_init(GArrowJSONReadOptionsClass *klass) parse_options.newlines_in_values, static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, - PROP_JSON_READER_ALLOW_NEWLINES_IN_VALUES, + PROP_JSON_READ_OPTIONS_ALLOW_NEWLINES_IN_VALUES, spec); /** @@ -1830,7 +1864,7 @@ garrow_json_read_options_class_init(GArrowJSONReadOptionsClass *klass) GARROW_JSON_READ_INFER_TYPE, 
static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, - PROP_JSON_READER_UNEXPECTED_FIELD_BEHAVIOR, + PROP_JSON_READ_OPTIONS_UNEXPECTED_FIELD_BEHAVIOR, spec); /** @@ -1846,7 +1880,7 @@ garrow_json_read_options_class_init(GArrowJSONReadOptionsClass *klass) GARROW_TYPE_SCHEMA, static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, - PROP_JSON_READER_SCHEMA, + PROP_JSON_READ_OPTIONS_SCHEMA, spec); } @@ -1867,10 +1901,12 @@ garrow_json_read_options_new(void) typedef struct GArrowJSONReaderPrivate_ { std::shared_ptr reader; + GArrowInputStream *input; } GArrowJSONReaderPrivate; enum { - PROP_JSON_TABLE_READER = 1 + PROP_JSON_TABLE_READER = 1, + PROP_JSON_READER_INPUT, }; G_DEFINE_TYPE_WITH_PRIVATE(GArrowJSONReader, @@ -1887,11 +1923,24 @@ garrow_json_reader_dispose(GObject *object) { auto priv = GARROW_JSON_READER_GET_PRIVATE(object); - priv->reader = nullptr; + if (priv->input) { + g_object_unref(priv->input); + priv->input = nullptr; + } G_OBJECT_CLASS(garrow_json_reader_parent_class)->dispose(object); } +static void +garrow_json_reader_finalize(GObject *object) +{ + auto priv = GARROW_JSON_READER_GET_PRIVATE(object); + + priv->reader.~shared_ptr(); + + G_OBJECT_CLASS(garrow_json_reader_parent_class)->finalize(object); +} + static void garrow_json_reader_set_property(GObject *object, guint prop_id, @@ -1905,6 +1954,9 @@ garrow_json_reader_set_property(GObject *object, priv->reader = *static_cast *>(g_value_get_pointer(value)); break; + case PROP_JSON_READER_INPUT: + priv->input = GARROW_INPUT_STREAM(g_value_dup_object(value)); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -1917,7 +1969,12 @@ garrow_json_reader_get_property(GObject *object, GValue *value, GParamSpec *pspec) { + auto priv = GARROW_JSON_READER_GET_PRIVATE(object); + switch (prop_id) { + case PROP_JSON_READER_INPUT: + g_value_set_object(value, priv->input); + break; default: 
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -1927,25 +1984,37 @@ garrow_json_reader_get_property(GObject *object, static void garrow_json_reader_init(GArrowJSONReader *object) { + auto priv = GARROW_JSON_READER_GET_PRIVATE(object); + new(&priv->reader) std::shared_ptr; } static void garrow_json_reader_class_init(GArrowJSONReaderClass *klass) { - GParamSpec *spec; - auto gobject_class = G_OBJECT_CLASS(klass); gobject_class->dispose = garrow_json_reader_dispose; + gobject_class->finalize = garrow_json_reader_finalize; gobject_class->set_property = garrow_json_reader_set_property; gobject_class->get_property = garrow_json_reader_get_property; + GParamSpec *spec; spec = g_param_spec_pointer("json-table-reader", "JSON table reader", "The raw std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_JSON_TABLE_READER, spec); + + spec = g_param_spec_object("input", + "Input", + "The input stream to be read", + GARROW_TYPE_INPUT_STREAM, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, + PROP_JSON_READER_INPUT, + spec); } /** @@ -1981,7 +2050,7 @@ garrow_json_reader_new(GArrowInputStream *input, } if (garrow::check(error, arrow_reader, "[json-reader][new]")) { - return garrow_json_reader_new_raw(&(arrow_reader.ValueOrDie())); + return garrow_json_reader_new_raw(&*arrow_reader, input); } else { return NULL; } @@ -2086,10 +2155,12 @@ garrow_feather_file_reader_get_raw(GArrowFeatherFileReader *reader) } GArrowCSVReader * -garrow_csv_reader_new_raw(std::shared_ptr *arrow_reader) +garrow_csv_reader_new_raw(std::shared_ptr *arrow_reader, + GArrowInputStream *input) { auto reader = GARROW_CSV_READER(g_object_new(GARROW_TYPE_CSV_READER, "csv-table-reader", arrow_reader, + "input", input, NULL)); return reader; } @@ -2102,10 +2173,12 @@ garrow_csv_reader_get_raw(GArrowCSVReader *reader) } GArrowJSONReader * 
-garrow_json_reader_new_raw(std::shared_ptr *arrow_reader) +garrow_json_reader_new_raw(std::shared_ptr *arrow_reader, + GArrowInputStream *input) { auto reader = GARROW_JSON_READER(g_object_new(GARROW_TYPE_JSON_READER, "json-table-reader", arrow_reader, + "input", input, NULL)); return reader; } diff --git a/c_glib/arrow-glib/reader.hpp b/c_glib/arrow-glib/reader.hpp index c1df700fe13..c7b2b76f215 100644 --- a/c_glib/arrow-glib/reader.hpp +++ b/c_glib/arrow-glib/reader.hpp @@ -44,11 +44,13 @@ std::shared_ptr garrow_feather_file_reader_get_raw(GArrowFeatherFileReader *reader); GArrowCSVReader * -garrow_csv_reader_new_raw(std::shared_ptr *arrow_reader); +garrow_csv_reader_new_raw(std::shared_ptr *arrow_reader, + GArrowInputStream *input); std::shared_ptr garrow_csv_reader_get_raw(GArrowCSVReader *reader); GArrowJSONReader * -garrow_json_reader_new_raw(std::shared_ptr *arrow_reader); +garrow_json_reader_new_raw(std::shared_ptr *arrow_reader, + GArrowInputStream *input); std::shared_ptr garrow_json_reader_get_raw(GArrowJSONReader *reader); diff --git a/c_glib/arrow-glib/table.h b/c_glib/arrow-glib/table.h index fd2d7edfd65..11fa121ad2f 100644 --- a/c_glib/arrow-glib/table.h +++ b/c_glib/arrow-glib/table.h @@ -102,12 +102,12 @@ garrow_table_concatenate(GArrowTable *table, GList *other_tables, GError **error); GARROW_AVAILABLE_IN_0_14 -GArrowTable* +GArrowTable * garrow_table_slice(GArrowTable *table, gint64 offset, gint64 length); GARROW_AVAILABLE_IN_0_16 -GArrowTable* +GArrowTable * garrow_table_combine_chunks(GArrowTable *table, GError **error); diff --git a/c_glib/arrow-glib/version.h.in b/c_glib/arrow-glib/version.h.in index 4ad59afd1e3..5a74566fd4a 100644 --- a/c_glib/arrow-glib/version.h.in +++ b/c_glib/arrow-glib/version.h.in @@ -110,6 +110,15 @@ # define GARROW_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor) #endif +/** + * GARROW_VERSION_4_0: + * + * You can use this macro value for compile time API version check. 
+ * + * Since: 4.0.0 + */ +#define GARROW_VERSION_4_0 G_ENCODE_VERSION(4, 0) + /** * GARROW_VERSION_3_0: * @@ -146,6 +155,15 @@ */ #define GARROW_VERSION_0_17 G_ENCODE_VERSION(0, 17) +/** + * GARROW_VERSION_0_16: + * + * You can use this macro value for compile time API version check. + * + * Since: 0.16.0 + */ +#define GARROW_VERSION_0_16 G_ENCODE_VERSION(0, 16) + /** * GARROW_VERSION_0_15: * @@ -238,6 +256,20 @@ #define GARROW_AVAILABLE_IN_ALL +#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_4_0 +# define GARROW_DEPRECATED_IN_4_0 GARROW_DEPRECATED +# define GARROW_DEPRECATED_IN_4_0_FOR(function) GARROW_DEPRECATED_FOR(function) +#else +# define GARROW_DEPRECATED_IN_4_0 +# define GARROW_DEPRECATED_IN_4_0_FOR(function) +#endif + +#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_4_0 +# define GARROW_AVAILABLE_IN_4_0 GARROW_UNAVAILABLE(4, 0) +#else +# define GARROW_AVAILABLE_IN_4_0 +#endif + #if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_3_0 # define GARROW_DEPRECATED_IN_3_0 GARROW_DEPRECATED # define GARROW_DEPRECATED_IN_3_0_FOR(function) GARROW_DEPRECATED_FOR(function) diff --git a/c_glib/configure.ac b/c_glib/configure.ac deleted file mode 100644 index 58c75b45002..00000000000 --- a/c_glib/configure.ac +++ /dev/null @@ -1,346 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -AC_PREREQ(2.65) - -m4_define([arrow_glib_version], 4.0.0-SNAPSHOT) -AC_INIT([arrow-glib], - arrow_glib_version, - [https://issues.apache.org/jira/browse/ARROW], - [apache-arrow-glib]) -AC_CONFIG_AUX_DIR([config]) -AC_CONFIG_MACRO_DIR([m4]) - -AC_CONFIG_SRCDIR([arrow-glib/arrow-glib.h]) -AC_CONFIG_HEADERS([config.h]) - -AM_INIT_AUTOMAKE([1.13 foreign]) -AM_SILENT_RULES([yes]) - -GARROW_VERSION_MAJOR=$(echo "arrow_glib_version" | \ - sed -E -e 's/^([[0-9]]+)\..+$/\1/' | \ - tr -d '\n') -GARROW_VERSION_MINOR=$(echo "arrow_glib_version" | \ - sed -E -e 's/^[[0-9]]+\.([[0-9]]+)\..+$/\1/' | \ - tr -d '\n') -GARROW_VERSION_MICRO=$(echo "arrow_glib_version" | \ - sed -E -e 's/^[[0-9]]+\.[[0-9]]+\.([[0-9]]+).*$/\1/' | \ - tr -d '\n') -if echo "arrow_glib_version" | grep -- "-" > /dev/null; then - GARROW_VERSION_TAG=$(echo "arrow_glib_version" | \ - sed -E -e 's/^[[0-9]]+\.[[0-9]]+\.[[0-9]]+-(.+)$/\1/' | \ - tr -d '\n') -else - GARROW_VERSION_TAG= -fi -AC_SUBST(GARROW_VERSION_MAJOR) -AC_SUBST(GARROW_VERSION_MINOR) -AC_SUBST(GARROW_VERSION_MICRO) -AC_SUBST(GARROW_VERSION_TAG) - -GGANDIVA_VERSION_MAJOR=${GARROW_VERSION_MAJOR} -GGANDIVA_VERSION_MINOR=${GARROW_VERSION_MINOR} -GGANDIVA_VERSION_MICRO=${GARROW_VERSION_MICRO} -GGANDIVA_VERSION_TAG=${GARROW_VERSION_TAG} -AC_SUBST(GGANDIVA_VERSION_MAJOR) -AC_SUBST(GGANDIVA_VERSION_MINOR) -AC_SUBST(GGANDIVA_VERSION_MICRO) -AC_SUBST(GGANDIVA_VERSION_TAG) - -GPARQUET_VERSION_MAJOR=${GARROW_VERSION_MAJOR} -GPARQUET_VERSION_MINOR=${GARROW_VERSION_MINOR} -GPARQUET_VERSION_MICRO=${GARROW_VERSION_MICRO} -GPARQUET_VERSION_TAG=${GARROW_VERSION_TAG} -AC_SUBST(GPARQUET_VERSION_MAJOR) -AC_SUBST(GPARQUET_VERSION_MINOR) -AC_SUBST(GPARQUET_VERSION_MICRO) -AC_SUBST(GPARQUET_VERSION_TAG) - -AC_CANONICAL_HOST -AC_MSG_CHECKING([for macOS]) -case "$host_os" in -darwin*) - os_macos=yes - ;; -*) - os_macos=no - ;; -esac -AC_MSG_RESULT([$os_macos]) 
-AM_CONDITIONAL(OS_MACOS, test "$os_macos" = "yes") - -LT_INIT -LT_CURRENT=$(expr ${GARROW_VERSION_MAJOR} \* 100 + ${GARROW_VERSION_MINOR}) -LT_REVISION=${GARROW_VERSION_MICRO} -LT_AGE=0 -LT_VERSION_INFO="\$(LT_CURRENT):\$(LT_REVISION):\$(LT_AGE)" -AC_SUBST(LT_CURRENT) -AC_SUBST(LT_REVISION) -AC_SUBST(LT_AGE) -AC_SUBST(LT_VERSION_INFO) - -AC_PROG_CC -AC_PROG_CXX -AX_CXX_COMPILE_STDCXX_11([ext], [mandatory]) - -GARROW_CFLAGS="-Wall" -GARROW_CXXFLAGS="-Wall" -AC_ARG_ENABLE(debug, - [AS_HELP_STRING([--enable-debug], - [Use debug flags (default=no)])], - [GARROW_DEBUG="$enableval"], - [GARROW_DEBUG="no"]) -if test "x$GARROW_DEBUG" != "xno"; then - GARROW_DEBUG="yes" - if test "$CLANG" = "yes"; then - CFLAGS="$CFLAGS -O0 -g" - CXXFLAGS="$CXXFLAGS -O0 -g" - elif test "$GCC" = "yes"; then - CFLAGS="$CFLAGS -O0 -g3" - CXXFLAGS="$CXXFLAGS -O0 -g3" - fi -fi -AC_ARG_ENABLE(development-mode, - [AS_HELP_STRING([--enable-development-mode], - [Use development mode (default=no)])], - [GARROW_DEVELOPMENT_MODE="$enableval"], - [GARROW_DEVELOPMENT_MODE="no"]) -if test "x$GARROW_DEVELOPMENT_MODE" != "xno"; then - if test "$CLANG" = "yes" -o "$GCC" = "yes"; then - CFLAGS="$CFLAGS -Werror" - CXXFLAGS="$CXXFLAGS -Werror" - fi -fi -AC_SUBST(GARROW_CFLAGS) -AC_SUBST(GARROW_CXXFLAGS) - -AM_PATH_GLIB_2_0([2.32.4], - [], - [AC_MSG_ERROR(GLib isn't available)], - [gobject gio]) - -GOBJECT_INTROSPECTION_CHECK([1.32.1]) -GTK_DOC_CHECK([1.18-2]) - -AC_ARG_WITH(arrow-cpp-build-type, - [AS_HELP_STRING([--with-arrow-cpp-build-type=TYPE], - [-DCMAKE_BUILD_TYPE option value for Arrow C++ (default=release)])], - [GARROW_ARROW_CPP_BUILD_TYPE="$withval"], - [GARROW_ARROW_CPP_BUILD_TYPE="release"]) - -ARROW_CUDA_PKG_CONFIG_PATH="" -AC_ARG_WITH(arrow-cpp-build-dir, - [AS_HELP_STRING([--with-arrow-cpp-build-dir=PATH], - [Use this option to build with not installed Arrow C++])], - [GARROW_ARROW_CPP_BUILD_DIR="$withval"], - [GARROW_ARROW_CPP_BUILD_DIR=""]) -if test "x$GARROW_ARROW_CPP_BUILD_DIR" = "x"; then - 
USE_ARROW_BUILD_DIR=no - - arrow_packages="arrow" - arrow_packages="${arrow_packages} arrow-compute" - arrow_packages="${arrow_packages} arrow-csv" - arrow_packages="${arrow_packages} arrow-filesystem" - arrow_packages="${arrow_packages} arrow-json" - PKG_CHECK_MODULES([ARROW], [${arrow_packages}]) - _PKG_CONFIG(ARROW_LIB_DIR, [variable=libdir], [arrow]) - ARROW_LIB_DIR="$pkg_cv_ARROW_LIB_DIR" - PKG_CHECK_MODULES([ARROW_ORC], - [arrow-orc], - [HAVE_ARROW_ORC=yes], - [HAVE_ARROW_ORC=no]) - PKG_CHECK_MODULES([ARROW_CUDA], - [arrow-cuda], - [HAVE_ARROW_CUDA=yes], - [HAVE_ARROW_CUDA=no]) - PKG_CHECK_MODULES([ARROW_DATASET], - [arrow-dataset], - [HAVE_ARROW_DATASET=yes], - [HAVE_ARROW_DATASET=no]) - PKG_CHECK_MODULES([GANDIVA], - [gandiva], - [HAVE_GANDIVA=yes], - [HAVE_GANDIVA=no]) - PKG_CHECK_MODULES([PARQUET], - [parquet], - [HAVE_PARQUET=yes], - [HAVE_PARQUET=no]) - PKG_CHECK_MODULES([PLASMA], - [plasma], - [HAVE_PLASMA=yes], - [HAVE_PLASMA=no]) -else - USE_ARROW_BUILD_DIR=yes - - ARROW_BUILD_DIR="${GARROW_ARROW_CPP_BUILD_DIR}" - AC_SUBST(ARROW_BUILD_DIR) - - ARROW_SOURCE_INCLUDE_DIR="\$(abs_top_srcdir)/../cpp/src" - ARROW_BUILD_INCLUDE_DIR="${GARROW_ARROW_CPP_BUILD_DIR}/src" - ARROW_LIB_DIR="${GARROW_ARROW_CPP_BUILD_DIR}/${GARROW_ARROW_CPP_BUILD_TYPE}" - AC_SUBST(ARROW_LIB_DIR) - - ARROW_CFLAGS="-I${ARROW_BUILD_INCLUDE_DIR} -I${ARROW_SOURCE_INCLUDE_DIR}" - ARROW_LIBS="-L\$(ARROW_LIB_DIR) -larrow" - AC_SUBST(ARROW_CFLAGS) - AC_SUBST(ARROW_LIBS) - - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/arrow/adapters/orc/arrow-orc.pc"; then - HAVE_ARROW_ORC=yes - else - HAVE_ARROW_ORC=no - fi - - ARROW_CUDA_CFLAGS="\$(ARROW_CFLAGS)" - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/arrow/gpu/arrow-cuda.pc"; then - HAVE_ARROW_CUDA=yes - ARROW_CUDA_LIBS="-L\$(ARROW_LIB_DIR) -larrow_cuda -larrow" - ARROW_CUDA_PKG_CONFIG_PATH="\$(ARROW_BUILD_DIR)/src/arrow/gpu" - else - HAVE_ARROW_CUDA=no - ARROW_CUDA_LIBS="" - ARROW_CUDA_PKG_CONFIG_PATH="" - fi - AC_SUBST(ARROW_CUDA_CFLAGS) - 
AC_SUBST(ARROW_CUDA_LIBS) - AC_SUBST(ARROW_CUDA_PKG_CONFIG_PATH) - - ARROW_DATASET_CFLAGS="\$(ARROW_CFLAGS)" - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/arrow/dataset/arrow-dataset.pc"; then - HAVE_ARROW_DATASET=yes - ARROW_DATASET_LIBS="-L\$(ARROW_LIB_DIR) -larrow_dataset -lparquet -larrow" - else - HAVE_ARROW_DATASET=no - ARROW_DATASET_LIBS="" - fi - AC_SUBST(ARROW_DATASET_CFLAGS) - AC_SUBST(ARROW_DATASET_LIBS) - - GANDIVA_CFLAGS="\$(ARROW_CFLAGS)" - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/gandiva/gandiva.pc"; then - HAVE_GANDIVA=yes - GANDIVA_LIBS="-L\$(ARROW_LIB_DIR) -lgandiva -larrow" - else - HAVE_GANDIVA=no - GANDIVA_LIBS="" - fi - AC_SUBST(GANDIVA_CFLAGS) - AC_SUBST(GANDIVA_LIBS) - - PARQUET_CFLAGS="\$(ARROW_CFLAGS)" - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/parquet/parquet.pc"; then - HAVE_PARQUET=yes - PARQUET_LIBS="-L\$(ARROW_LIB_DIR) -lparquet -larrow" - else - HAVE_PARQUET=no - PARQUET_LIBS="" - fi - AC_SUBST(PARQUET_CFLAGS) - AC_SUBST(PARQUET_LIBS) - - PLASMA_CFLAGS="\$(ARROW_CFLAGS)" - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/plasma/plasma.pc"; then - HAVE_PLASMA=yes - PLASMA_LIBS="-L\$(ARROW_LIB_DIR) -lplasma -larrow" - else - HAVE_PLASMA=no - PLASMA_LIBS="" - fi - AC_SUBST(PLASMA_CFLAGS) - AC_SUBST(PLASMA_LIBS) -fi - -AM_CONDITIONAL([USE_ARROW_BUILD_DIR], - [test "$USE_ARROW_BUILD_DIR" = "yes"]) - -AM_CONDITIONAL([HAVE_ARROW_ORC], [test "$HAVE_ARROW_ORC" = "yes"]) -if test "$HAVE_ARROW_ORC" = "yes"; then - AC_DEFINE(HAVE_ARROW_ORC, [1], [Define to 1 if Apache Arrow supports ORC.]) -fi - -AM_CONDITIONAL([HAVE_ARROW_CUDA], [test "$HAVE_ARROW_CUDA" = "yes"]) -if test "$HAVE_ARROW_CUDA" = "yes"; then - ARROW_CUDA_GLIB_PACKAGE="arrow-cuda-glib" - PLASMA_ARROW_CUDA_PKG_CONFIG_PATH=":\$(abs_top_builddir)/arrow-cuda-glib" - if test -n "${ARROW_CUDA_PKG_CONFIG_PATH}"; then - PLASMA_ARROW_CUDA_PKG_CONFIG_PATH=":${ARROW_CUDA_PKG_CONFIG_PATH}${PLASMA_ARROW_CUDA_PKG_CONFIG_PATH}" - fi - AC_DEFINE(HAVE_ARROW_CUDA, [1], [Define to 1 if Apache 
Arrow supports CUDA.]) -else - ARROW_CUDA_GLIB_PACKAGE="" - PLASMA_ARROW_CUDA_PKG_CONFIG_PATH="" -fi -AC_SUBST(ARROW_CUDA_GLIB_PACKAGE) -AC_SUBST(PLASMA_ARROW_CUDA_PKG_CONFIG_PATH) - -AM_CONDITIONAL([HAVE_ARROW_DATASET], [test "$HAVE_ARROW_DATASET" = "yes"]) -if test "$HAVE_ARROW_DATASET" = "yes"; then - AC_DEFINE(HAVE_ARROW_DATASET, [1], [Define to 1 if Apache Arrow Dataset exists.]) -fi - -AM_CONDITIONAL([HAVE_GANDIVA], [test "$HAVE_GANDIVA" = "yes"]) -if test "$HAVE_GANDIVA" = "yes"; then - AC_DEFINE(HAVE_GANDIVA, [1], [Define to 1 if Gandiva exists.]) -fi - -AM_CONDITIONAL([HAVE_PARQUET], [test "$HAVE_PARQUET" = "yes"]) -if test "$HAVE_PARQUET" = "yes"; then - AC_DEFINE(HAVE_PARQUET, [1], [Define to 1 if Apache Parquet exists.]) -fi - -AM_CONDITIONAL([HAVE_PLASMA], [test "$HAVE_PLASMA" = "yes"]) -if test "$HAVE_PLASMA" = "yes"; then - AC_DEFINE(HAVE_PLASMA, [1], [Define to 1 if Plasma exists.]) -fi - -exampledir="\$(datadir)/arrow-glib/example" -AC_SUBST(exampledir) - -AC_CONFIG_FILES([ - Makefile - arrow-cuda-glib/Makefile - arrow-cuda-glib/arrow-cuda-glib.pc - arrow-dataset-glib/Makefile - arrow-dataset-glib/arrow-dataset-glib.pc - arrow-glib/Makefile - arrow-glib/arrow-glib.pc - arrow-glib/arrow-orc-glib.pc - arrow-glib/version.h - gandiva-glib/Makefile - gandiva-glib/gandiva-glib.pc - gandiva-glib/version.h - parquet-glib/Makefile - parquet-glib/parquet-glib.pc - parquet-glib/version.h - plasma-glib/Makefile - plasma-glib/plasma-glib.pc - doc/Makefile - doc/arrow-dataset-glib/Makefile - doc/arrow-dataset-glib/entities.xml - doc/arrow-glib/Makefile - doc/arrow-glib/entities.xml - doc/gandiva-glib/Makefile - doc/gandiva-glib/entities.xml - doc/parquet-glib/Makefile - doc/parquet-glib/entities.xml - doc/plasma-glib/Makefile - doc/plasma-glib/entities.xml - example/Makefile - example/lua/Makefile -]) - -AC_OUTPUT diff --git a/c_glib/doc/arrow-dataset-glib/Makefile.am b/c_glib/doc/arrow-dataset-glib/Makefile.am deleted file mode 100644 index 
d1c636143ff..00000000000 --- a/c_glib/doc/arrow-dataset-glib/Makefile.am +++ /dev/null @@ -1,69 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -if HAVE_ARROW_DATASET -DOC_MODULE = arrow-dataset-glib - -DOC_MAIN_SGML_FILE = $(DOC_MODULE)-docs.xml - -DOC_SOURCE_DIR = \ - $(top_srcdir)/arrow-dataset-glib \ - $(top_builddir)/arrow-dataset-glib - -SCAN_OPTIONS = \ - --deprecated-guards="GARROW_DISABLE_DEPRECATED" - -MKDB_OPTIONS = \ - --name-space=gad \ - --source-suffixes="c,cpp,h" - -HFILE_GLOB = \ - $(top_srcdir)/arrow-dataset-glib/*.h - -IGNORE_HFILES = - -CFILE_GLOB = \ - $(top_srcdir)/arrow-dataset-glib/*.cpp - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CFLAGS) \ - $(ARROW_DATASET_CFLAGS) - -GTKDOC_LIBS = \ - $(top_builddir)/arrow-glib/libarrow-glib.la \ - $(top_builddir)/arrow-dataset-glib/libarrow-dataset-glib.la - -include $(top_srcdir)/gtk-doc.make - -CLEANFILES += \ - $(DOC_MODULE)-decl-list.txt \ - $(DOC_MODULE)-decl.txt \ - $(DOC_MODULE)-overrides.txt \ - $(DOC_MODULE)-sections.txt \ - $(DOC_MODULE).types -else -EXTRA_DIST = -endif - -EXTRA_DIST += \ - entities.xml.in \ - meson.build diff --git a/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml 
b/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml index 83f874b81f0..92ae0405dac 100644 --- a/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml +++ b/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml @@ -41,6 +41,8 @@ Scan + Fragment + diff --git a/c_glib/doc/arrow-glib/Makefile.am b/c_glib/doc/arrow-glib/Makefile.am deleted file mode 100644 index db9f00f39f3..00000000000 --- a/c_glib/doc/arrow-glib/Makefile.am +++ /dev/null @@ -1,80 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -DOC_MODULE = arrow-glib - -DOC_MAIN_SGML_FILE = $(DOC_MODULE)-docs.xml - -DOC_SOURCE_DIR = \ - $(top_srcdir)/arrow-glib \ - $(top_builddir)/arrow-glib - -SCAN_OPTIONS = \ - --deprecated-guards="GARROW_DISABLE_DEPRECATED" - -MKDB_OPTIONS = \ - --name-space=garrow \ - --source-suffixes="c,cpp,h" - -HFILE_GLOB = \ - $(top_srcdir)/arrow-glib/*.h \ - $(top_builddir)/arrow-glib/*.h - -IGNORE_HFILES = - -if !HAVE_ARROW_ORC -IGNORE_HFILES += \ - $(top_srcdir)/arrow-glib/orc-file-reader.h -endif - -CFILE_GLOB = \ - $(top_srcdir)/arrow-glib/*.cpp - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CFLAGS) - -GTKDOC_LIBS = \ - $(top_builddir)/arrow-glib/libarrow-glib.la - -if HAVE_ARROW_CUDA -DOC_SOURCE_DIR += \ - $(top_srcdir)/arrow-cuda-glib -HFILE_GLOB += \ - $(top_srcdir)/arrow-cuda-glib/*.h -CFILE_GLOB += \ - $(top_srcdir)/arrow-cuda-glib/*.cpp -GTKDOC_LIBS += \ - $(top_builddir)/arrow-cuda-glib/libarrow-cuda-glib.la -endif - -include $(top_srcdir)/gtk-doc.make - -CLEANFILES += \ - $(DOC_MODULE)-decl-list.txt \ - $(DOC_MODULE)-decl.txt \ - $(DOC_MODULE)-overrides.txt \ - $(DOC_MODULE)-sections.txt \ - $(DOC_MODULE).types - -EXTRA_DIST += \ - entities.xml.in \ - meson.build diff --git a/c_glib/doc/arrow-glib/arrow-glib-docs.xml b/c_glib/doc/arrow-glib/arrow-glib-docs.xml index ed42b492447..9198b6a13a6 100644 --- a/c_glib/doc/arrow-glib/arrow-glib-docs.xml +++ b/c_glib/doc/arrow-glib/arrow-glib-docs.xml @@ -179,6 +179,10 @@ Index of deprecated API + + Index of new symbols in 4.0.0 + + Index of new symbols in 3.0.0 diff --git a/c_glib/doc/gandiva-glib/Makefile.am b/c_glib/doc/gandiva-glib/Makefile.am deleted file mode 100644 index 16d333d0ae3..00000000000 --- a/c_glib/doc/gandiva-glib/Makefile.am +++ /dev/null @@ -1,69 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -if HAVE_GANDIVA -DOC_MODULE = gandiva-glib - -DOC_MAIN_SGML_FILE = $(DOC_MODULE)-docs.xml - -DOC_SOURCE_DIR = \ - $(top_srcdir)/gandiva-glib \ - $(top_builddir)/gandiva-glib - -SCAN_OPTIONS = \ - --deprecated-guards="GGANDIVA_DISABLE_DEPRECATED" - -MKDB_OPTIONS = \ - --name-space=ggandiva \ - --source-suffixes="c,cpp,h" - -HFILE_GLOB = \ - $(top_srcdir)/gandiva-glib/*.h - -IGNORE_HFILES = - -CFILE_GLOB = \ - $(top_srcdir)/gandiva-glib/*.cpp - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CFLAGS) \ - $(GANDIVA_CFLAGS) - -GTKDOC_LIBS = \ - $(top_builddir)/arrow-glib/libarrow-glib.la \ - $(top_builddir)/gandiva-glib/libgandiva-glib.la - -include $(top_srcdir)/gtk-doc.make - -CLEANFILES += \ - $(DOC_MODULE)-decl-list.txt \ - $(DOC_MODULE)-decl.txt \ - $(DOC_MODULE)-overrides.txt \ - $(DOC_MODULE)-sections.txt \ - $(DOC_MODULE).types -else -EXTRA_DIST = -endif - -EXTRA_DIST += \ - entities.xml.in \ - meson.build diff --git a/c_glib/doc/parquet-glib/Makefile.am b/c_glib/doc/parquet-glib/Makefile.am deleted file mode 100644 index d125be1b54c..00000000000 --- a/c_glib/doc/parquet-glib/Makefile.am +++ /dev/null @@ -1,69 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -if HAVE_PARQUET -DOC_MODULE = parquet-glib - -DOC_MAIN_SGML_FILE = $(DOC_MODULE)-docs.xml - -DOC_SOURCE_DIR = \ - $(top_srcdir)/parquet-glib \ - $(top_builddir)/parquet-glib - -SCAN_OPTIONS = \ - --deprecated-guards="GPARQUET_DISABLE_DEPRECATED" - -MKDB_OPTIONS = \ - --name-space=gparquet \ - --source-suffixes="c,cpp,h" - -HFILE_GLOB = \ - $(top_srcdir)/parquet-glib/*.h - -IGNORE_HFILES = - -CFILE_GLOB = \ - $(top_srcdir)/parquet-glib/*.cpp - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CFLAGS) \ - $(PARQUET_CFLAGS) - -GTKDOC_LIBS = \ - $(top_builddir)/parquet-glib/libparquet-glib.la \ - $(top_builddir)/arrow-glib/libarrow-glib.la - -include $(top_srcdir)/gtk-doc.make - -CLEANFILES += \ - $(DOC_MODULE)-decl-list.txt \ - $(DOC_MODULE)-decl.txt \ - $(DOC_MODULE)-overrides.txt \ - $(DOC_MODULE)-sections.txt \ - $(DOC_MODULE).types -else -EXTRA_DIST = -endif - -EXTRA_DIST += \ - entities.xml.in \ - meson.build diff --git a/c_glib/doc/plasma-glib/Makefile.am b/c_glib/doc/plasma-glib/Makefile.am deleted file mode 100644 index df872d6ca31..00000000000 --- a/c_glib/doc/plasma-glib/Makefile.am +++ /dev/null @@ -1,76 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -PLASMA_ARROW_CUDA_GTKDOC_LIBS = -if HAVE_ARROW_CUDA -PLASMA_ARROW_CUDA_GTKDOC_LIBS += \ - $(top_builddir)/arrow-cuda-glib/libarrow-cuda-glib.la -endif - -if HAVE_PLASMA -DOC_MODULE = plasma-glib - -DOC_MAIN_SGML_FILE = $(DOC_MODULE)-docs.xml - -DOC_SOURCE_DIR = \ - $(top_srcdir)/plasma-glib \ - $(top_builddir)/plasma-glib - -SCAN_OPTIONS = \ - --deprecated-guards="GPLASMA_DISABLE_DEPRECATED" - -MKDB_OPTIONS = \ - --name-space=gplasma \ - --source-suffixes="c,cpp,h" - -HFILE_GLOB = \ - $(top_srcdir)/plasma-glib/*.h - -IGNORE_HFILES = - -CFILE_GLOB = \ - $(top_srcdir)/plasma-glib/*.cpp - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CFLAGS) \ - $(PLASMA_CFLAGS) - -GTKDOC_LIBS = \ - $(top_builddir)/arrow-glib/libarrow-glib.la \ - $(PLASMA_ARROW_CUDA_GTKDOC_LIBS) \ - $(top_builddir)/plasma-glib/libplasma-glib.la - -include $(top_srcdir)/gtk-doc.make - -CLEANFILES += \ - $(DOC_MODULE)-decl-list.txt \ - $(DOC_MODULE)-decl.txt \ - $(DOC_MODULE)-overrides.txt \ - $(DOC_MODULE)-sections.txt \ - $(DOC_MODULE).types -else -EXTRA_DIST = -endif - -EXTRA_DIST += \ - entities.xml.in \ - meson.build diff --git a/c_glib/example/Makefile.am b/c_glib/example/Makefile.am deleted file mode 100644 index 9e460ecf8e0..00000000000 --- 
a/c_glib/example/Makefile.am +++ /dev/null @@ -1,64 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -SUBDIRS = \ - lua - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) \ - -DGARROW_DISABLE_DEPRECATED - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(GARROW_CFLAGS) - -AM_LDFLAGS = \ - $(GLIB_LIBS) \ - $(builddir)/../arrow-glib/libarrow-glib.la -if USE_ARROW_BUILD_DIR -AM_LDFLAGS += \ - $(ARROW_LIBS) -endif - -noinst_PROGRAMS = \ - build \ - extension-type \ - read-batch \ - read-stream - -build_SOURCES = \ - build.c - -extension_type_SOURCES = \ - extension-type.c - -read_batch_SOURCES = \ - read-batch.c - -read_stream_SOURCES = \ - read-stream.c - -dist_example_DATA = \ - README.md \ - $(build_SOURCES) \ - $(extension_type_SOURCES) \ - $(read_batch_SOURCES) \ - $(read_stream_SOURCES) diff --git a/c_glib/example/lua/Makefile.am b/c_glib/example/lua/Makefile.am deleted file mode 100644 index 84ddbc7607b..00000000000 --- a/c_glib/example/lua/Makefile.am +++ /dev/null @@ -1,27 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -EXTRA_DIST = \ - meson.build - -lua_exampledir = $(exampledir)/lua -dist_lua_example_DATA = \ - README.md \ - read-batch.lua \ - read-stream.lua \ - write-batch.lua \ - write-stream.lua diff --git a/c_glib/gandiva-glib/Makefile.am b/c_glib/gandiva-glib/Makefile.am deleted file mode 100644 index 5991abeab3a..00000000000 --- a/c_glib/gandiva-glib/Makefile.am +++ /dev/null @@ -1,196 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -CLEANFILES = -DISTCLEANFILES = - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(GARROW_CFLAGS) \ - $(GGANDIVA_CFLAGS) - -if HAVE_GANDIVA -lib_LTLIBRARIES = \ - libgandiva-glib.la - -libgandiva_glib_la_CXXFLAGS = \ - $(GLIB_CFLAGS) \ - $(GANDIVA_CFLAGS) \ - $(GARROW_CFLAGS) \ - $(GGANDIVA_CFLAGS) - -libgandiva_glib_la_LDFLAGS = \ - -version-info $(LT_VERSION_INFO) \ - -no-undefined - -libgandiva_glib_la_LIBADD = \ - $(GLIB_LIBS) \ - ../arrow-glib/libarrow-glib.la \ - $(GANDIVA_LIBS) - -libgandiva_glib_la_headers = \ - expression.h \ - function-registry.h \ - function-signature.h \ - gandiva-glib.h \ - native-function.h \ - node.h \ - projector.h - -libgandiva_glib_la_generated_headers = \ - enums.h \ - version.h - -libgandiva_glib_la_generated_sources = \ - enums.c \ - $(libgandiva_glib_la_generated_headers) - -libgandiva_glib_la_sources = \ - expression.cpp \ - function-registry.cpp \ - function-signature.cpp \ - node.cpp \ - native-function.cpp \ - projector.cpp \ - $(libgandiva_glib_la_headers) \ - $(libgandiva_glib_la_generated_sources) - -libgandiva_glib_la_cpp_headers = \ - expression.hpp \ - function-signature.hpp \ - gandiva-glib.hpp \ - native-function.hpp \ - node.hpp \ - projector.hpp - -libgandiva_glib_la_SOURCES = \ - $(libgandiva_glib_la_sources) \ - $(libgandiva_glib_la_cpp_headers) - -BUILT_SOURCES = \ - $(libgandiva_glib_la_generated_sources) \ - stamp-enums.c \ - stamp-enums.h - -DISTCLEANFILES += \ - stamp-enums.c \ - stamp-enums.h - -EXTRA_DIST += \ - enums.c.template \ - enums.h.template - -enums.h: stamp-enums.h - @true -stamp-enums.h: $(libgandiva_glib_la_headers) enums.h.template - $(AM_V_GEN) \ - (cd $(srcdir) && \ - $(GLIB_MKENUMS) \ - --identifier-prefix GGandiva \ - --symbol-prefix ggandiva \ - --template enums.h.template \ - $(libgandiva_glib_la_headers)) > enums.h - touch $@ - -enums.c: stamp-enums.c - @true -stamp-enums.c: $(libarrow_glib_la_headers) 
enums.c.template - $(AM_V_GEN) \ - (cd $(srcdir) && \ - $(GLIB_MKENUMS) \ - --identifier-prefix GGandiva \ - --symbol-prefix ggandiva \ - --template enums.c.template \ - $(libgandiva_glib_la_headers)) > enums.c - touch $@ - -gandiva_glib_includedir = $(includedir)/gandiva-glib -gandiva_glib_include_HEADERS = \ - $(libgandiva_glib_la_headers) \ - $(libgandiva_glib_la_cpp_headers) \ - $(libgandiva_glib_la_generated_headers) - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = \ - gandiva-glib.pc - -# GObject Introspection -if HAVE_INTROSPECTION --include $(INTROSPECTION_MAKEFILE) -INTROSPECTION_GIRS = -INTROSPECTION_SCANNER_ARGS = -INTROSPECTION_SCANNER_ENV = -if USE_ARROW_BUILD_DIR -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_top_builddir}/arrow-glib:$(ARROW_BUILD_DIR)/src/arrow:$${PKG_CONFIG_PATH} -else -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_top_builddir}/arrow-glib:$${PKG_CONFIG_PATH} -endif -INTROSPECTION_COMPILER_ARGS = \ - --includedir=$(abs_top_builddir)/arrow-glib - -Gandiva-1.0.gir: libgandiva-glib.la -Gandiva_1_0_gir_PACKAGES = \ - arrow-glib -Gandiva_1_0_gir_EXPORT_PACKAGES = \ - gandiva-glib -Gandiva_1_0_gir_INCLUDES = \ - Arrow-1.0 -Gandiva_1_0_gir_CFLAGS = \ - $(AM_CPPFLAGS) -Gandiva_1_0_gir_LIBS = -Gandiva_1_0_gir_FILES = $(libgandiva_glib_la_sources) -Gandiva_1_0_gir_SCANNERFLAGS = \ - --add-include-path=$(abs_top_builddir)/arrow-glib \ - --library-path=$(ARROW_LIB_DIR) \ - --warn-all \ - --identifier-prefix=GGandiva \ - --symbol-prefix=ggandiva -if OS_MACOS -Gandiva_1_0_gir_LIBS += \ - arrow-glib \ - gandiva-glib -Gandiva_1_0_gir_SCANNERFLAGS += \ - --no-libtool \ - --library-path=$(abs_top_builddir)/arrow-glib/.libs \ - --library-path=$(abs_builddir)/.libs -else -Gandiva_1_0_gir_LIBS += \ - $(abs_top_builddir)/arrow-glib/libarrow-glib.la \ - libgandiva-glib.la -endif -INTROSPECTION_GIRS += Gandiva-1.0.gir - -girdir = $(datadir)/gir-1.0 -gir_DATA = $(INTROSPECTION_GIRS) - -typelibdir = $(libdir)/girepository-1.0 
-typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) - -CLEANFILES += \ - $(gir_DATA) \ - $(typelib_DATA) -endif -endif diff --git a/c_glib/gandiva-glib/gandiva-glib.pc.in b/c_glib/gandiva-glib/gandiva-glib.pc.in deleted file mode 100644 index 7160f5ff422..00000000000 --- a/c_glib/gandiva-glib/gandiva-glib.pc.in +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: Apache Arrow Gandiva GLib -Description: C API for Apache Arrow Gandiva based on GLib -Version: @VERSION@ -Libs: -L${libdir} -lgandiva-glib -Cflags: -I${includedir} -Requires: gandiva arrow-glib diff --git a/c_glib/parquet-glib/Makefile.am b/c_glib/parquet-glib/Makefile.am deleted file mode 100644 index a813b3ce9cc..00000000000 --- a/c_glib/parquet-glib/Makefile.am +++ /dev/null @@ -1,145 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -CLEANFILES = -DISTCLEANFILES = - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(GARROW_CFLAGS) \ - $(GPARQUET_CFLAGS) - -if HAVE_PARQUET -lib_LTLIBRARIES = \ - libparquet-glib.la - -libparquet_glib_la_CXXFLAGS = \ - $(GLIB_CFLAGS) \ - $(PARQUET_CFLAGS) \ - $(GARROW_CFLAGS) \ - $(GPARQUET_CXXFLAGS) - -libparquet_glib_la_LDFLAGS = \ - -version-info $(LT_VERSION_INFO) \ - -no-undefined - -libparquet_glib_la_LIBADD = \ - $(GLIB_LIBS) \ - ../arrow-glib/libarrow-glib.la \ - $(PARQUET_LIBS) - -libparquet_glib_la_headers = \ - arrow-file-reader.h \ - arrow-file-writer.h \ - parquet-glib.h - -libparquet_glib_la_generated_headers = \ - version.h - -libparquet_glib_la_sources = \ - arrow-file-reader.cpp \ - arrow-file-writer.cpp \ - $(libparquet_glib_la_headers) \ - $(libparquet_glib_la_generated_headers) - -libparquet_glib_la_cpp_headers = \ - arrow-file-reader.hpp \ - arrow-file-writer.hpp \ - parquet-glib.hpp - -libparquet_glib_la_SOURCES = \ - $(libparquet_glib_la_sources) \ - $(libparquet_glib_la_cpp_headers) - -BUILT_SOURCES = \ - $(libparquet_glib_la_generated_headers) - -parquet_glib_includedir = $(includedir)/parquet-glib -parquet_glib_include_HEADERS = \ - $(libparquet_glib_la_headers) \ - $(libparquet_glib_la_cpp_headers) - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = \ - parquet-glib.pc - -# GObject 
Introspection -if HAVE_INTROSPECTION --include $(INTROSPECTION_MAKEFILE) -INTROSPECTION_GIRS = -INTROSPECTION_SCANNER_ARGS = -INTROSPECTION_SCANNER_ENV = -if USE_ARROW_BUILD_DIR -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_top_builddir}/arrow-glib:$(ARROW_BUILD_DIR)/src/arrow:$${PKG_CONFIG_PATH} -else -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_top_builddir}/arrow-glib:$${PKG_CONFIG_PATH} -endif -INTROSPECTION_COMPILER_ARGS = \ - --includedir=$(abs_top_builddir)/arrow-glib - -Parquet-1.0.gir: libparquet-glib.la -Parquet_1_0_gir_PACKAGES = \ - arrow-glib -Parquet_1_0_gir_EXPORT_PACKAGES = \ - parquet-glib -Parquet_1_0_gir_INCLUDES = \ - Arrow-1.0 -Parquet_1_0_gir_CFLAGS = \ - $(AM_CPPFLAGS) -Parquet_1_0_gir_LIBS = -Parquet_1_0_gir_FILES = $(libparquet_glib_la_sources) -Parquet_1_0_gir_SCANNERFLAGS = \ - --add-include-path=$(abs_top_builddir)/arrow-glib \ - --library-path=$(ARROW_LIB_DIR) \ - --warn-all \ - --identifier-prefix=GParquet \ - --symbol-prefix=gparquet -if OS_MACOS -Parquet_1_0_gir_LIBS += \ - arrow-glib \ - parquet-glib -Parquet_1_0_gir_SCANNERFLAGS += \ - --no-libtool \ - --library-path=$(abs_top_builddir)/arrow-glib/.libs \ - --library-path=$(abs_builddir)/.libs -else -Parquet_1_0_gir_LIBS += \ - $(abs_top_builddir)/arrow-glib/libarrow-glib.la \ - libparquet-glib.la -endif -INTROSPECTION_GIRS += Parquet-1.0.gir - -girdir = $(datadir)/gir-1.0 -gir_DATA = $(INTROSPECTION_GIRS) - -typelibdir = $(libdir)/girepository-1.0 -typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) - -CLEANFILES += \ - $(gir_DATA) \ - $(typelib_DATA) -endif -endif diff --git a/c_glib/parquet-glib/parquet-glib.pc.in b/c_glib/parquet-glib/parquet-glib.pc.in deleted file mode 100644 index 81559f1bce1..00000000000 --- a/c_glib/parquet-glib/parquet-glib.pc.in +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: Apache Parquet GLib -Description: C API for Apache Parquet based on GLib -Version: @VERSION@ -Libs: -L${libdir} -lparquet-glib -Cflags: -I${includedir} -Requires: arrow-glib diff --git a/c_glib/plasma-glib/Makefile.am b/c_glib/plasma-glib/Makefile.am deleted file mode 100644 index 60499a4065f..00000000000 --- a/c_glib/plasma-glib/Makefile.am +++ /dev/null @@ -1,171 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -CLEANFILES = -DISTCLEANFILES = - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) \ - -DG_LOG_DOMAIN=\"Plasma\" - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(GARROW_CFLAGS) \ - $(GPLASMA_CFLAGS) - -PLASMA_ARROW_CUDA_LIBS = -PLASMA_INTROSPECTION_COMPILER_ARROW_CUDA_ARGS = -PLASMA_GIR_ARROW_CUDA_PACKAGE = -PLASMA_GIR_ARROW_CUDA_SCANNER_ADD_INCLUDE_PATH = -PLASMA_GIR_ARROW_CUDA_LIBS_MACOS = -PLASMA_GIR_ARROW_CUDA_SCANNER_LIBRARY_PATH_MACOS = -PLASMA_GIR_ARROW_CUDA_LIBS = -if HAVE_ARROW_CUDA -PLASMA_ARROW_CUDA_LIBS += \ - ../arrow-cuda-glib/libarrow-cuda-glib.la \ - $(ARROW_CUDA_LIBS) -PLASMA_INTROSPECTION_COMPILER_ARROW_CUDA_ARGS += \ - --includedir=$(abs_top_builddir)/arrow-cuda-glib -PLASMA_GIR_ARROW_CUDA_PACKAGE += \ - arrow-cuda-glib -PLASMA_GIR_ARROW_CUDA_SCANNER_ADD_INCLUDE_PATH += \ - --add-include-path=$(abs_top_builddir)/arrow-cuda-glib -PLASMA_GIR_ARROW_CUDA_LIBS_MACOS += \ - arrow-cuda-glib -PLASMA_GIR_ARROW_CUDA_SCANNER_LIBRARY_PATH_MACOS += \ - --library-path=$(abs_top_builddir)/arrow-cuda-glib/.libs -PLASMA_GIR_ARROW_CUDA_LIBS += \ - $(abs_top_builddir)/arrow-cuda-glib/libarrow-cuda-glib.la -endif - -if HAVE_PLASMA -lib_LTLIBRARIES = \ - libplasma-glib.la - -libplasma_glib_la_CXXFLAGS = \ - $(GLIB_CFLAGS) \ - $(PLASMA_CFLAGS) \ - $(GARROW_CFLAGS) \ - $(GPLASMA_CFLAGS) - -libplasma_glib_la_LDFLAGS = \ - -version-info $(LT_VERSION_INFO) \ - -no-undefined - -libplasma_glib_la_LIBADD = \ - $(GLIB_LIBS) \ - ../arrow-glib/libarrow-glib.la \ - $(PLASMA_LIBS) \ - $(PLASMA_ARROW_CUDA_LIBS) - -libplasma_glib_la_headers = \ - client.h \ - object.h \ - plasma-glib.h - -libplasma_glib_la_sources = \ - client.cpp \ - object.cpp \ - $(libplasma_glib_la_headers) - -libplasma_glib_la_cpp_headers = \ - client.hpp \ - object.hpp \ - plasma-glib.hpp - -libplasma_glib_la_SOURCES = \ - $(libplasma_glib_la_sources) \ - $(libplasma_glib_la_cpp_headers) - -plasma_glib_includedir = $(includedir)/plasma-glib -plasma_glib_include_HEADERS = \ - 
$(libplasma_glib_la_headers) \ - $(libplasma_glib_la_cpp_headers) - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = \ - plasma-glib.pc - -# GObject Introspection -if HAVE_INTROSPECTION --include $(INTROSPECTION_MAKEFILE) -INTROSPECTION_GIRS = -INTROSPECTION_SCANNER_ARGS = -INTROSPECTION_SCANNER_ENV = -if USE_ARROW_BUILD_DIR -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=$(abs_top_builddir)/arrow-glib$(PLASMA_ARROW_CUDA_PKG_CONFIG_PATH):$(ARROW_BUILD_DIR)/src/arrow:$${PKG_CONFIG_PATH} -else -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=$(abs_top_builddir)/arrow-glib$(PLASMA_ARROW_CUDA_PKG_CONFIG_PATH):$${PKG_CONFIG_PATH} -endif -INTROSPECTION_COMPILER_ARGS = \ - --includedir=$(abs_top_builddir)/arrow-glib \ - $(PLASMA_INTROSPECTION_COMPILER_ARROW_CUDA_INCLUDEDIR) - -Plasma-1.0.gir: libplasma-glib.la -Plasma_1_0_gir_PACKAGES = \ - arrow-glib \ - $(PLASMA_GIR_ARROW_CUDA_PACKAGE) -Plasma_1_0_gir_EXPORT_PACKAGES = \ - plasma-glib -Plasma_1_0_gir_INCLUDES = \ - Arrow-1.0 -Plasma_1_0_gir_CFLAGS = \ - $(AM_CPPFLAGS) -Plasma_1_0_gir_LIBS = -Plasma_1_0_gir_FILES = $(libplasma_glib_la_sources) -Plasma_1_0_gir_SCANNERFLAGS = \ - --add-include-path=$(abs_top_builddir)/arrow-glib \ - $(PLASMA_GIR_ARROW_CUDA_SCANNER_ADD_INCLUDE_PATH) \ - --library-path=$(ARROW_LIB_DIR) \ - --warn-all \ - --identifier-prefix=GPlasma \ - --symbol-prefix=gplasma -if OS_MACOS -Plasma_1_0_gir_LIBS += \ - arrow-glib \ - $(PLASMA_GIR_ARROW_CUDA_LIBS_MACOS) \ - plasma-glib -Plasma_1_0_gir_SCANNERFLAGS += \ - --no-libtool \ - --library-path=$(abs_top_builddir)/arrow-glib/.libs \ - $(PLASMA_GIR_ARROW_CUDA_SCANNER_LIBRARY_PATH_MACOS) \ - --library-path=$(abs_builddir)/.libs -else -Plasma_1_0_gir_LIBS += \ - $(abs_top_builddir)/arrow-glib/libarrow-glib.la \ - $(PLASMA_GIR_ARROW_CUDA_LIBS) \ - libplasma-glib.la -endif -INTROSPECTION_GIRS += Plasma-1.0.gir - -girdir = $(datadir)/gir-1.0 -gir_DATA = $(INTROSPECTION_GIRS) - -typelibdir = $(libdir)/girepository-1.0 -typelib_DATA = 
$(INTROSPECTION_GIRS:.gir=.typelib) - -CLEANFILES += \ - $(gir_DATA) \ - $(typelib_DATA) -endif -endif diff --git a/c_glib/plasma-glib/plasma-glib.pc.in b/c_glib/plasma-glib/plasma-glib.pc.in deleted file mode 100644 index c82fe69580f..00000000000 --- a/c_glib/plasma-glib/plasma-glib.pc.in +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: Apache Arrow Plasma GLib -Description: C API for Apache Arrow Plasma based on GLib -Version: @VERSION@ -Libs: -L${libdir} -lplasma-glib -Cflags: -I${includedir} -Requires: plasma arrow-glib @ARROW_CUDA_GLIB_PACKAGE@ diff --git a/c_glib/test/dataset/test-in-memory-scan-task.rb b/c_glib/test/dataset/test-in-memory-scan-task.rb index 2749012b717..06e3d0d2424 100644 --- a/c_glib/test/dataset/test-in-memory-scan-task.rb +++ b/c_glib/test/dataset/test-in-memory-scan-task.rb @@ -40,21 +40,18 @@ def setup @scan_options = ArrowDataset::ScanOptions.new(@schema) - @scan_context = ArrowDataset::ScanContext.new + @fragment = ArrowDataset::InMemoryFragment.new(@schema, + @record_batches) @scan_task = ArrowDataset::InMemoryScanTask.new(@record_batches, @scan_options, - @scan_context) + @fragment) end def test_scan_options assert_equal(@scan_options, @scan_task.options) end - def test_scan_context - assert_equal(@scan_context, @scan_task.context) - end - def test_execute assert_equal(@record_batches, @scan_task.execute.to_list) diff --git a/c_glib/test/dataset/test-scan-options.rb b/c_glib/test/dataset/test-scan-options.rb index 1f5b77f2e9f..0536b2a7cca 100644 --- a/c_glib/test/dataset/test-scan-options.rb +++ b/c_glib/test/dataset/test-scan-options.rb @@ -35,10 +35,13 @@ def test_batch_size @scan_options.batch_size) end - def test_replace_schema - other_schema = Arrow::Schema.new([Arrow::Field.new("visible", Arrow::BooleanDataType.new)]) - other_scan_options = @scan_options.replace_schema(other_schema) - assert_not_equal(@schema, other_scan_options.schema) - assert_equal(other_schema, other_scan_options.schema) + def test_use_threads + assert do + not @scan_options.use_threads? + end + @scan_options.use_threads = true + assert do + @scan_options.use_threads? 
+ end end end diff --git a/c_glib/test/test-array.rb b/c_glib/test/test-array.rb index ee176b9949e..c03aecf1732 100644 --- a/c_glib/test/test-array.rb +++ b/c_glib/test/test-array.rb @@ -158,4 +158,31 @@ def test_different_type array.diff_unified(other_array)) end end + + sub_test_case("#concatenate") do + def test_no_other_arrays + assert_equal(build_int32_array([1, 2, 3]), + build_int32_array([1, 2, 3]).concatenate([])) + end + + def test_multiple_other_arrays + a = build_int32_array([1, 2, 3]) + b = build_int32_array([4]) + c = build_int32_array([5, 6]) + assert_equal(build_int32_array([1, 2, 3, 4, 5, 6]), + a.concatenate([b, c])) + end + + def test_mixed_type + int32_array = build_int32_array([1, 2, 3]) + uint32_array = build_uint32_array([4]) + message = + "[array][concatenate]: Invalid: " + + "arrays to be concatenated must be identically typed, " + + "but int32 and uint32 were encountered." + assert_raise(Arrow::Error::Invalid.new(message)) do + int32_array.concatenate([uint32_array]) + end + end + end end diff --git a/c_glib/test/test-chunked-array.rb b/c_glib/test/test-chunked-array.rb index 82b46968a0d..8f912ac846b 100644 --- a/c_glib/test/test-chunked-array.rb +++ b/c_glib/test/test-chunked-array.rb @@ -128,4 +128,14 @@ def test_to_s ] PRETTY_PRINT end + + def test_combine + chunks = [ + build_boolean_array([true]), + build_boolean_array([false, nil]), + ] + chunked_array = Arrow::ChunkedArray.new(chunks) + assert_equal(build_boolean_array([true, false, nil]), + chunked_array.combine) + end end diff --git a/c_glib/test/test-decimal128-data-type.rb b/c_glib/test/test-decimal128-data-type.rb index a02e3badca0..b27e1cad1ea 100644 --- a/c_glib/test/test-decimal128-data-type.rb +++ b/c_glib/test/test-decimal128-data-type.rb @@ -23,12 +23,12 @@ def test_type def test_name data_type = Arrow::Decimal128DataType.new(2, 0) - assert_equal("decimal", data_type.name) + assert_equal("decimal128", data_type.name) end def test_to_s data_type = 
Arrow::Decimal128DataType.new(2, 0) - assert_equal("decimal(2, 0)", data_type.to_s) + assert_equal("decimal128(2, 0)", data_type.to_s) end def test_precision diff --git a/ci/docker/conda-integration.dockerfile b/ci/docker/conda-integration.dockerfile index e6e5ac859dd..374db7eb2f6 100644 --- a/ci/docker/conda-integration.dockerfile +++ b/ci/docker/conda-integration.dockerfile @@ -23,7 +23,7 @@ ARG arch=amd64 ARG maven=3.5 ARG node=14 ARG jdk=8 -ARG go=1.12 +ARG go=1.15 COPY ci/conda_env_archery.yml /arrow/ci/ RUN conda install -q \ diff --git a/ci/docker/debian-10-go.dockerfile b/ci/docker/debian-10-go.dockerfile index 594a30ad786..199f09e24fc 100644 --- a/ci/docker/debian-10-go.dockerfile +++ b/ci/docker/debian-10-go.dockerfile @@ -16,7 +16,7 @@ # under the License. ARG arch=amd64 -ARG go=1.12 +ARG go=1.15 FROM ${arch}/golang:${go} # TODO(kszucs): diff --git a/ci/docker/python-wheel-manylinux-201x.dockerfile b/ci/docker/python-wheel-manylinux-201x.dockerfile index 2bdb7a926cd..19246a46764 100644 --- a/ci/docker/python-wheel-manylinux-201x.dockerfile +++ b/ci/docker/python-wheel-manylinux-201x.dockerfile @@ -18,11 +18,14 @@ ARG base FROM ${base} +ARG arch_alias +ARG arch_short_alias + RUN yum install -y git flex curl autoconf zip wget # Install CMake -ARG cmake=3.19.2 -RUN wget -q https://github.com/Kitware/CMake/releases/download/v${cmake}/cmake-${cmake}-Linux-x86_64.tar.gz -O - | \ +ARG cmake=3.19.3 +RUN wget -q https://github.com/Kitware/CMake/releases/download/v${cmake}/cmake-${cmake}-Linux-${arch_alias}.tar.gz -O - | \ tar -xzf - --directory /usr/local --strip-components=1 # Install Ninja @@ -51,23 +54,27 @@ RUN mkdir /tmp/ccache && \ ARG vcpkg RUN git clone https://github.com/microsoft/vcpkg /opt/vcpkg && \ git -C /opt/vcpkg checkout ${vcpkg} && \ - /opt/vcpkg/bootstrap-vcpkg.sh --useSystemBinaries --disableMetrics && \ + /opt/vcpkg/bootstrap-vcpkg.sh -useSystemBinaries -disableMetrics && \ ln -s /opt/vcpkg/vcpkg /usr/bin/vcpkg # Patch ports files as needed 
COPY ci/vcpkg arrow/ci/vcpkg -RUN cd /opt/vcpkg && patch -p1 -i /arrow/ci/vcpkg/ports.patch +RUN cd /opt/vcpkg && git apply --ignore-whitespace /arrow/ci/vcpkg/ports.patch ARG build_type=release ENV CMAKE_BUILD_TYPE=${build_type} \ VCPKG_FORCE_SYSTEM_BINARIES=1 \ VCPKG_OVERLAY_TRIPLETS=/arrow/ci/vcpkg \ - VCPKG_DEFAULT_TRIPLET=x64-linux-static-${build_type} \ + VCPKG_DEFAULT_TRIPLET=${arch_short_alias}-linux-static-${build_type} \ VCPKG_FEATURE_FLAGS=-manifests +# Need to install the boost-build prior installing the boost packages, otherwise +# vcpkg will raise an error. # TODO(kszucs): factor out the package enumeration to a text file and reuse it # from the windows image and potentially in a future macos wheel build RUN vcpkg install --clean-after-build \ + boost-build:${arch_short_alias}-linux && \ + vcpkg install --clean-after-build \ abseil \ aws-sdk-cpp[config,cognito-identity,core,identity-management,s3,sts,transfer] \ boost-filesystem \ diff --git a/dev/release/source/Dockerfile b/ci/docker/python-wheel-manylinux-test.dockerfile similarity index 72% rename from dev/release/source/Dockerfile rename to ci/docker/python-wheel-manylinux-test.dockerfile index 7d5453b80c4..55c27d1d7bb 100644 --- a/dev/release/source/Dockerfile +++ b/ci/docker/python-wheel-manylinux-test.dockerfile @@ -15,17 +15,13 @@ # specific language governing permissions and limitations # under the License. 
-FROM debian:buster +ARG arch +ARG python +FROM ${arch}/python:${python} -ENV DEBIAN_FRONTEND noninteractive +# RUN pip install --upgrade pip -RUN apt update && \ - apt install -y -V \ - autoconf-archive \ - gtk-doc-tools \ - libgirepository1.0-dev \ - libglib2.0-doc \ - libtool \ - pkg-config && \ - apt clean && \ - rm -rf /var/lib/apt/lists/* +# pandas doesn't provide wheel for aarch64 yet, so cache the compiled +# test dependencies in a docker image +COPY python/requirements-wheel-test.txt /arrow/python/ +RUN pip install -r /arrow/python/requirements-wheel-test.txt diff --git a/ci/docker/python-wheel-windows-vs2017.dockerfile b/ci/docker/python-wheel-windows-vs2017.dockerfile index c0b85d47938..0f66a20396e 100644 --- a/ci/docker/python-wheel-windows-vs2017.dockerfile +++ b/ci/docker/python-wheel-windows-vs2017.dockerfile @@ -35,7 +35,7 @@ RUN git clone https://github.com/Microsoft/vcpkg && \ # Patch ports files as needed COPY ci/vcpkg arrow/ci/vcpkg -RUN cd vcpkg && patch -p1 -i C:/arrow/ci/vcpkg/ports.patch +RUN cd vcpkg && git apply --ignore-whitespace C:/arrow/ci/vcpkg/ports.patch # Configure vcpkg and install dependencies # NOTE: use windows batch environment notation for build arguments in RUN diff --git a/ci/docker/ubuntu-16.04-cpp.dockerfile b/ci/docker/ubuntu-16.04-cpp.dockerfile deleted file mode 100644 index 5c98ae30e1e..00000000000 --- a/ci/docker/ubuntu-16.04-cpp.dockerfile +++ /dev/null @@ -1,100 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG base=amd64/ubuntu:16.04 -FROM ${base} - -SHELL ["/bin/bash", "-o", "pipefail", "-c"] - -ENV DEBIAN_FRONTEND noninteractive - -# LLVM 10 or later requires C++ 14 but g++-5's C++ 14 support is limited. -# cpp/src/arrow/vendored/datetime/date.h doesn't work. -# ARG llvm -ENV llvm=8 -RUN apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - apt-transport-https \ - software-properties-common \ - wget && \ - wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ - apt-add-repository -y "deb https://apt.llvm.org/xenial/ llvm-toolchain-xenial-${llvm} main" && \ - apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - autoconf \ - ca-certificates \ - ccache \ - clang-${llvm} \ - cmake \ - g++ \ - gcc \ - gdb \ - git \ - libboost-all-dev \ - libbrotli-dev \ - libbz2-dev \ - libgoogle-glog-dev \ - liblz4-dev \ - libre2-dev \ - libssl-dev \ - libutf8proc-dev \ - libzstd1-dev \ - llvm-${llvm}-dev \ - make \ - ninja-build \ - pkg-config \ - protobuf-compiler \ - python3 \ - tzdata && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# Benchmark is deactivated as the external project requires CMake 3.6+ -# Gandiva JNI is deactivated as it requires CMake 3.11+ -# - c-ares in Xenial isn't recognized by gRPC build system -# - libprotobuf-dev / libprotoc-dev in Xenial too old for gRPC -# - libboost-all-dev does not include Boost.Process, needed for Flight -# unit tests, so doing vendored build by default -ENV ARROW_BUILD_BENCHMARKS=OFF \ - ARROW_BUILD_TESTS=ON \ - ARROW_DATASET=ON \ - 
ARROW_DEPENDENCY_SOURCE=SYSTEM \ - ARROW_GANDIVA_JAVA=OFF \ - ARROW_GANDIVA=ON \ - ARROW_HOME=/usr/local \ - ARROW_PARQUET=ON \ - ARROW_USE_CCACHE=ON \ - ARROW_WITH_BROTLI=ON \ - ARROW_WITH_BZ2=ON \ - ARROW_WITH_LZ4=ON \ - ARROW_WITH_SNAPPY=ON \ - ARROW_WITH_ZLIB=ON \ - ARROW_WITH_ZSTD=ON \ - BOOST_SOURCE=BUNDLED \ - cares_SOURCE=BUNDLED \ - CC=gcc \ - CXX=g++ \ - gRPC_SOURCE=BUNDLED \ - GTest_SOURCE=BUNDLED \ - ORC_SOURCE=BUNDLED \ - PARQUET_BUILD_EXAMPLES=ON \ - PARQUET_BUILD_EXECUTABLES=ON \ - PATH=/usr/lib/ccache/:$PATH \ - Protobuf_SOURCE=BUNDLED \ - RapidJSON_SOURCE=BUNDLED \ - Snappy_SOURCE=BUNDLED \ - Thrift_SOURCE=BUNDLED diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 1d9e41bba7a..82340e08174 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -79,8 +79,10 @@ build() { export CPPFLAGS="${CPPFLAGS} -I${MINGW_PREFIX}/include" export LIBS="-L${MINGW_PREFIX}/libs" export ARROW_S3=OFF + export ARROW_WITH_RE2=OFF else export ARROW_S3=ON + export ARROW_WITH_RE2=ON fi MSYS2_ARG_CONV_EXCL="-DCMAKE_INSTALL_PREFIX=" \ @@ -105,12 +107,13 @@ build() { -DARROW_SNAPPY_USE_SHARED=OFF \ -DARROW_USE_GLOG=OFF \ -DARROW_WITH_LZ4=ON \ + -DARROW_WITH_RE2="${ARROW_WITH_RE2}" \ -DARROW_WITH_SNAPPY=ON \ -DARROW_WITH_ZLIB=ON \ -DARROW_WITH_ZSTD=ON \ -DARROW_ZSTD_USE_SHARED=OFF \ -DARROW_CXXFLAGS="${CPPFLAGS}" \ - -DCMAKE_BUILD_TYPE="release" \ + -DCMAKE_BUILD_TYPE="debug" \ -DCMAKE_INSTALL_PREFIX=${MINGW_PREFIX} \ -DCMAKE_UNITY_BUILD=ON \ -DCMAKE_VERBOSE_MAKEFILE=ON diff --git a/ci/scripts/go_build.sh b/ci/scripts/go_build.sh index bb53bd82131..7093be4d238 100755 --- a/ci/scripts/go_build.sh +++ b/ci/scripts/go_build.sh @@ -27,3 +27,10 @@ go get -d -t -v ./... go install -v ./... popd + +pushd ${source_dir}/parquet + +go get -d -t -v ./... +go install -v ./... 
+ +popd diff --git a/ci/scripts/go_test.sh b/ci/scripts/go_test.sh index 077749fc945..7dd873df3e1 100755 --- a/ci/scripts/go_test.sh +++ b/ci/scripts/go_test.sh @@ -28,3 +28,11 @@ for d in $(go list ./... | grep -v vendor); do done popd + +pushd ${source_dir}/parquet + +for d in $(go list ./... | grep -v vendor); do + go test $d +done + +popd diff --git a/ci/scripts/install_dask.sh b/ci/scripts/install_dask.sh index 0fb070a6d08..954ce3249d9 100755 --- a/ci/scripts/install_dask.sh +++ b/ci/scripts/install_dask.sh @@ -27,7 +27,7 @@ fi dask=$1 if [ "${dask}" = "master" ]; then - pip install https://github.com/dask/dask/archive/master.tar.gz#egg=dask[dataframe] + pip install https://github.com/dask/dask/archive/main.tar.gz#egg=dask[dataframe] elif [ "${dask}" = "latest" ]; then conda install -q dask else diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index aa23e5b7c18..5d2e71916ed 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -33,3 +33,4 @@ archery integration --with-all --run-flight \ --gold-dirs=$gold_dir/1.0.0-bigendian \ --gold-dirs=$gold_dir/1.0.0-littleendian \ --gold-dirs=$gold_dir/2.0.0-compression \ + --gold-dirs=$gold_dir/4.0.0-shareddict \ diff --git a/ci/scripts/msys2_setup.sh b/ci/scripts/msys2_setup.sh index 3f451e96b83..cb6ca30a64e 100755 --- a/ci/scripts/msys2_setup.sh +++ b/ci/scripts/msys2_setup.sh @@ -61,6 +61,7 @@ esac pacman \ --needed \ --noconfirm \ + --refresh \ --sync \ "${packages[@]}" diff --git a/ci/scripts/msys2_system_clean.sh b/ci/scripts/msys2_system_clean.sh index 57ecc256e20..a356aee6660 100755 --- a/ci/scripts/msys2_system_clean.sh +++ b/ci/scripts/msys2_system_clean.sh @@ -29,4 +29,5 @@ pacman \ ${MINGW_PACKAGE_PREFIX}-gcc-ada \ ${MINGW_PACKAGE_PREFIX}-gcc-fortran \ ${MINGW_PACKAGE_PREFIX}-gcc-libgfortran \ - ${MINGW_PACKAGE_PREFIX}-gcc-objc + ${MINGW_PACKAGE_PREFIX}-gcc-objc \ + ${MINGW_PACKAGE_PREFIX}-libgccjit diff --git 
a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh new file mode 100755 index 00000000000..7a021f70f74 --- /dev/null +++ b/ci/scripts/python_wheel_macos_build.sh @@ -0,0 +1,133 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -ex + +source_dir=${1} +build_dir=${2} + +echo "=== (${PYTHON_VERSION}) Clear output directories and leftovers ===" +# Clear output directories and leftovers +rm -rf ${build_dir}/install +rm -rf ${source_dir}/python/dist +rm -rf ${source_dir}/python/build +rm -rf ${source_dir}/python/repaired_wheels +rm -rf ${source_dir}/python/pyarrow/*.so +rm -rf ${source_dir}/python/pyarrow/*.so.* + +echo "=== (${PYTHON_VERSION}) Set OSX SDK and C flags ===" +# Arrow is 64-bit-only at the moment +export CFLAGS="-fPIC -arch x86_64 ${CFLAGS//-arch i386/}" +export CXXFLAGS="-fPIC -arch x86_64 ${CXXFLAGS//-arch i386} -std=c++11" +export SDKROOT="$(xcrun --show-sdk-path)" + +echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" +: ${ARROW_DATASET:=ON} +: ${ARROW_FLIGHT:=ON} +: ${ARROW_GANDIVA:=OFF} +: ${ARROW_HDFS:=ON} +: ${ARROW_JEMALLOC:=ON} +: ${ARROW_MIMALLOC:=ON} +: ${ARROW_ORC:=ON} +: ${ARROW_PARQUET:=ON} +: ${ARROW_PLASMA:=ON} +: ${ARROW_S3:=ON} +: ${ARROW_TENSORFLOW:=ON} +: ${ARROW_WITH_BROTLI:=ON} +: ${ARROW_WITH_BZ2:=ON} +: ${ARROW_WITH_LZ4:=ON} +: ${ARROW_WITH_SNAPPY:=ON} +: ${ARROW_WITH_ZLIB:=ON} +: ${ARROW_WITH_ZSTD:=ON} +: ${CMAKE_BUILD_TYPE:=release} +: ${CMAKE_GENERATOR:=Ninja} +: ${VCPKG_FEATURE_FLAGS:=-manifests} +: ${VCPKG_TARGET_TRIPLET:=${VCPKG_DEFAULT_TRIPLET:-x64-osx-static-${CMAKE_BUILD_TYPE}}} + +mkdir -p ${build_dir}/build +pushd ${build_dir}/build +cmake \ + -DARROW_BUILD_SHARED=ON \ + -DARROW_BUILD_STATIC=OFF \ + -DARROW_BUILD_TESTS=OFF \ + -DARROW_DATASET=${ARROW_DATASET} \ + -DARROW_DEPENDENCY_SOURCE="VCPKG" \ + -DARROW_DEPENDENCY_USE_SHARED=OFF \ + -DARROW_FLIGHT==${ARROW_FLIGHT} \ + -DARROW_GANDIVA=${ARROW_GANDIVA} \ + -DARROW_HDFS=${ARROW_HDFS} \ + -DARROW_JEMALLOC=${ARROW_JEMALLOC} \ + -DARROW_MIMALLOC=${ARROW_MIMALLOC} \ + -DARROW_ORC=${ARROW_ORC} \ + -DARROW_PACKAGE_KIND="manylinux${MANYLINUX_VERSION}" \ + -DARROW_PARQUET=${ARROW_PARQUET} \ + -DARROW_PLASMA=${ARROW_PLASMA} \ + -DARROW_PYTHON=ON \ + -DARROW_RPATH_ORIGIN=ON \ + 
-DARROW_S3=${ARROW_S3} \ + -DARROW_TENSORFLOW=${ARROW_TENSORFLOW} \ + -DARROW_USE_CCACHE=ON \ + -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI} \ + -DARROW_WITH_BZ2=${ARROW_WITH_BZ2} \ + -DARROW_WITH_LZ4=${ARROW_WITH_LZ4} \ + -DARROW_WITH_SNAPPY=${ARROW_WITH_SNAPPY} \ + -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB} \ + -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD} \ + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DCMAKE_INSTALL_PREFIX=${build_dir}/install \ + -DCMAKE_UNITY_BUILD=ON \ + -DOPENSSL_USE_STATIC_LIBS=ON \ + -DVCPKG_MANIFEST_MODE=OFF \ + -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \ + -G ${CMAKE_GENERATOR} \ + ${source_dir}/cpp +cmake --build . --target install +popd + +# Check that we don't expose any unwanted symbols +# check_arrow_visibility + +echo "=== (${PYTHON_VERSION}) Building wheel ===" +export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE} +export PYARROW_BUNDLE_ARROW_CPP=1 +export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR} +export PYARROW_INSTALL_TESTS=1 +export PYARROW_WITH_DATASET=${ARROW_DATASET} +export PYARROW_WITH_FLIGHT=${ARROW_FLIGHT} +export PYARROW_WITH_GANDIVA=${ARROW_GANDIVA} +export PYARROW_WITH_HDFS=${ARROW_HDFS} +export PYARROW_WITH_ORC=${ARROW_ORC} +export PYARROW_WITH_PARQUET=${ARROW_PARQUET} +export PYARROW_WITH_PLASMA=${ARROW_PLASMA} +export PYARROW_WITH_S3=${ARROW_S3} +# PyArrow build configuration +export PKG_CONFIG_PATH=/usr/lib/pkgconfig:${build_dir}/install/lib/pkgconfig + +pushd ${source_dir}/python +python setup.py bdist_wheel +popd + +echo "=== (${PYTHON_VERSION}) Show dynamic libraries the wheel depend on ===" +deps=$(delocate-listdeps ${source_dir}/python/dist/*.whl) + +if echo $deps | grep -v "^@rpath/lib\(arrow\|gandiva\|parquet\|plasma\)"; then + echo "There are non-bundled shared library dependencies." 
+ exit 1 +fi diff --git a/ci/scripts/python_wheel_macos_test.sh b/ci/scripts/python_wheel_macos_test.sh new file mode 100755 index 00000000000..6ac8576d484 --- /dev/null +++ b/ci/scripts/python_wheel_macos_test.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -ex + +source_dir=${1} + +: ${ARROW_S3:=ON} + +export PYARROW_TEST_CYTHON=OFF +export PYARROW_TEST_DATASET=ON +export PYARROW_TEST_GANDIVA=OFF +export PYARROW_TEST_HDFS=ON +export PYARROW_TEST_ORC=ON +export PYARROW_TEST_PANDAS=ON +export PYARROW_TEST_PARQUET=ON +export PYARROW_TEST_PLASMA=ON +export PYARROW_TEST_S3=${ARROW_S3} +export PYARROW_TEST_TENSORFLOW=ON +export PYARROW_TEST_FLIGHT=ON + +export ARROW_TEST_DATA=${source_dir}/testing/data +export PARQUET_TEST_DATA=${source_dir}/submodules/parquet-testing/data + +# Install the built wheels +pip install ${source_dir}/python/dist/*.whl + +# Test that the modules are importable +python -c " +import pyarrow +import pyarrow._hdfs +import pyarrow.csv +import pyarrow.dataset +import pyarrow.flight +import pyarrow.fs +import pyarrow.json +import pyarrow.orc +import pyarrow.parquet +import pyarrow.plasma +" + +if [ "${PYARROW_TEST_S3}" == "ON" ]; then + python -c "import pyarrow._s3fs" +fi + +# Install testing dependencies +pip install -r ${source_dir}/python/requirements-wheel-test.txt + +# Execute unittest +pytest -r s --pyargs pyarrow diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index 0a52415a0b9..83aa623b49b 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -66,8 +66,10 @@ echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${ARROW_WITH_ZLIB:=ON} : ${ARROW_WITH_ZSTD:=ON} : ${CMAKE_BUILD_TYPE:=release} +: ${CMAKE_UNITY_BUILD:=ON} : ${CMAKE_GENERATOR:=Ninja} : ${VCPKG_FEATURE_FLAGS:=-manifests} +: ${VCPKG_TARGET_TRIPLET:=${VCPKG_DEFAULT_TRIPLET:-x64-linux-static-${CMAKE_BUILD_TYPE}}} mkdir /tmp/arrow-build pushd /tmp/arrow-build @@ -77,7 +79,7 @@ cmake \ -DARROW_BUILD_STATIC=OFF \ -DARROW_BUILD_TESTS=OFF \ -DARROW_DATASET=${ARROW_DATASET} \ - -DARROW_DEPENDENCY_SOURCE="SYSTEM" \ + -DARROW_DEPENDENCY_SOURCE="VCPKG" \ -DARROW_DEPENDENCY_USE_SHARED=OFF \ -DARROW_FLIGHT==${ARROW_FLIGHT} \ 
-DARROW_GANDIVA=${ARROW_GANDIVA} \ @@ -103,13 +105,10 @@ cmake \ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_INSTALL_PREFIX=/tmp/arrow-dist \ - -DCMAKE_TOOLCHAIN_FILE=/opt/vcpkg/scripts/buildsystems/vcpkg.cmake \ - -DCMAKE_UNITY_BUILD=ON \ + -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ -DOPENSSL_USE_STATIC_LIBS=ON \ - -DThrift_ROOT=/opt/vcpkg/installed/x64-linux/lib \ - -D_VCPKG_INSTALLED_DIR=/opt/vcpkg/installed \ -DVCPKG_MANIFEST_MODE=OFF \ - -DVCPKG_TARGET_TRIPLET=x64-linux-static-${CMAKE_BUILD_TYPE} \ + -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \ -G ${CMAKE_GENERATOR} \ /arrow/cpp cmake --build . --target install @@ -138,8 +137,5 @@ pushd /arrow/python python setup.py bdist_wheel echo "=== (${PYTHON_VERSION}) Tag the wheel with manylinux${MANYLINUX_VERSION} ===" -auditwheel repair \ - --plat "manylinux${MANYLINUX_VERSION}_x86_64" \ - -L . dist/pyarrow-*.whl \ - -w repaired_wheels +auditwheel repair -L . dist/pyarrow-*.whl -w repaired_wheels popd diff --git a/ci/scripts/python_wheel_manylinux_test.sh b/ci/scripts/python_wheel_manylinux_test.sh index d603f7c6a70..21987748f73 100755 --- a/ci/scripts/python_wheel_manylinux_test.sh +++ b/ci/scripts/python_wheel_manylinux_test.sh @@ -17,7 +17,23 @@ # specific language governing permissions and limitations # under the License. 
-set -ex +set -e +set -x +set -o pipefail + +case $# in + 1) KIND="$1" + case $KIND in + imports|unittests) ;; + *) echo "Invalid argument: '${KIND}', valid options are 'imports', 'unittests'" + exit 1 + ;; + esac + ;; + *) echo "Usage: $0 imports|unittests" + exit 1 + ;; +esac export PYARROW_TEST_CYTHON=OFF export PYARROW_TEST_DATASET=ON @@ -37,8 +53,9 @@ export PARQUET_TEST_DATA=/arrow/submodules/parquet-testing/data # Install the built wheels pip install /arrow/python/repaired_wheels/*.whl -# Test that the modules are importable -python -c " +if [ "${KIND}" == "imports" ]; then + # Test that the modules are importable + python -c " import pyarrow import pyarrow._hdfs import pyarrow._s3fs @@ -49,11 +66,8 @@ import pyarrow.fs import pyarrow.json import pyarrow.orc import pyarrow.parquet -import pyarrow.plasma -" - -# Install testing dependencies -pip install -r /arrow/python/requirements-wheel-test.txt - -# Execute unittest -pytest -v -r s --pyargs pyarrow +import pyarrow.plasma" +elif [ "${KIND}" == "unittests" ]; then + # Execute unittest, test dependencies must be installed + pytest -r s --pyargs pyarrow +fi diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index f61a2faea0d..18c1b657b21 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -57,7 +57,7 @@ cmake ^ -DARROW_BUILD_TESTS=OFF ^ -DARROW_CXXFLAGS="/MP" ^ -DARROW_DATASET=%ARROW_DATASET% ^ - -DARROW_DEPENDENCY_SOURCE=SYSTEM ^ + -DARROW_DEPENDENCY_SOURCE=VCPKG ^ -DARROW_DEPENDENCY_USE_SHARED=OFF ^ -DARROW_FLIGHT=%ARROW_FLIGHT% ^ -DARROW_GANDIVA=%ARROW_GANDIVA% ^ @@ -76,15 +76,10 @@ cmake ^ -DARROW_WITH_ZLIB=%ARROW_WITH_ZLIB% ^ -DARROW_WITH_ZSTD=%ARROW_WITH_ZSTD% ^ -DCMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE% ^ - -DLZ4_MSVC_LIB_PREFIX="" ^ - -DLZ4_MSVC_STATIC_LIB_SUFFIX="" ^ - -DZSTD_MSVC_LIB_PREFIX="" ^ -DCMAKE_CXX_COMPILER=clcache ^ -DCMAKE_INSTALL_PREFIX=C:\arrow-dist ^ - 
-DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake ^ -DCMAKE_UNITY_BUILD=%CMAKE_UNITY_BUILD% ^ -DMSVC_LINK_VERBOSE=ON ^ - -D_VCPKG_INSTALLED_DIR=C:\vcpkg\installed ^ -DVCPKG_MANIFEST_MODE=OFF ^ -DVCPKG_TARGET_TRIPLET=x64-windows-static-md-%CMAKE_BUILD_TYPE% ^ -G "%CMAKE_GENERATOR%" ^ diff --git a/ci/scripts/r_sanitize.sh b/ci/scripts/r_sanitize.sh index eacee5f17f3..89963eb2dd8 100755 --- a/ci/scripts/r_sanitize.sh +++ b/ci/scripts/r_sanitize.sh @@ -27,7 +27,7 @@ pushd ${source_dir}/tests export TEST_R_WITH_ARROW=TRUE export UBSAN_OPTIONS="print_stacktrace=1,suppressions=/arrow/r/tools/ubsan.supp" -${R_BIN} < testthat.R > testthat.out 2>&1 +${R_BIN} < testthat.R > testthat.out 2>&1 || { cat testthat.out; exit 1; } cat testthat.out if grep -q "runtime error" testthat.out; then diff --git a/c_glib/arrow-dataset-glib/arrow-dataset-glib.pc.in b/ci/vcpkg/arm64-linux-static-debug.cmake similarity index 68% rename from c_glib/arrow-dataset-glib/arrow-dataset-glib.pc.in rename to ci/vcpkg/arm64-linux-static-debug.cmake index ee7e13967df..6fea43694cd 100644 --- a/c_glib/arrow-dataset-glib/arrow-dataset-glib.pc.in +++ b/ci/vcpkg/arm64-linux-static-debug.cmake @@ -15,14 +15,14 @@ # specific language governing permissions and limitations # under the License. 
-prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ +set(VCPKG_TARGET_ARCHITECTURE arm64) +set(VCPKG_CRT_LINKAGE dynamic) +set(VCPKG_LIBRARY_LINKAGE static) +set(VCPKG_CMAKE_SYSTEM_NAME Linux) +set(VCPKG_BUILD_TYPE debug) -Name: Apache Arrow Dataset GLib -Description: C API for Apache Arrow Dataset based on GLib -Version: @VERSION@ -Libs: -L${libdir} -larrow-dataset-glib -Cflags: -I${includedir} -Requires: arrow-glib arrow-dataset +if(NOT CMAKE_HOST_SYSTEM_PROCESSOR) + execute_process(COMMAND "uname" "-m" + OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_PROCESSOR + OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() diff --git a/c_glib/arrow-cuda-glib/arrow-cuda-glib.pc.in b/ci/vcpkg/arm64-linux-static-release.cmake similarity index 68% rename from c_glib/arrow-cuda-glib/arrow-cuda-glib.pc.in rename to ci/vcpkg/arm64-linux-static-release.cmake index de0ce974c7a..4012848b849 100644 --- a/c_glib/arrow-cuda-glib/arrow-cuda-glib.pc.in +++ b/ci/vcpkg/arm64-linux-static-release.cmake @@ -15,14 +15,14 @@ # specific language governing permissions and limitations # under the License. 
-prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ +set(VCPKG_TARGET_ARCHITECTURE arm64) +set(VCPKG_CRT_LINKAGE dynamic) +set(VCPKG_LIBRARY_LINKAGE static) +set(VCPKG_CMAKE_SYSTEM_NAME Linux) +set(VCPKG_BUILD_TYPE release) -Name: Apache Arrow CUDA GLib -Description: C API for Apache Arrow CUDA based on GLib -Version: @VERSION@ -Libs: -L${libdir} -larrow-cuda-glib -Cflags: -I${includedir} -Requires: arrow-glib arrow-cuda +if(NOT CMAKE_HOST_SYSTEM_PROCESSOR) + execute_process(COMMAND "uname" "-m" + OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_PROCESSOR + OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() diff --git a/ci/vcpkg/ports.patch b/ci/vcpkg/ports.patch index 106bf723edc..14b9678690e 100644 --- a/ci/vcpkg/ports.patch +++ b/ci/vcpkg/ports.patch @@ -1,3 +1,18 @@ +diff --git a/ports/aws-c-common/portfile.cmake b/ports/aws-c-common/portfile.cmake +index f3704ef05..3af543058 100644 +--- a/ports/aws-c-common/portfile.cmake ++++ b/ports/aws-c-common/portfile.cmake +@@ -1,8 +1,8 @@ + vcpkg_from_github( + OUT_SOURCE_PATH SOURCE_PATH + REPO awslabs/aws-c-common +- REF 4a21a1c0757083a16497fea27886f5f20ccdf334 # v0.4.56 +- SHA512 68898a8ac15d5490f45676eabfbe0df9e45370a74c543a28909fd0d85fed48dfcf4bcd6ea2d01d1a036dd352e2e4e0b08c48c63ab2a2b477fe150b46a827136e ++ REF 13adef72b7813ec878817c6d50a7a3f241015d8a # v0.4.57 ++ SHA512 28256522ac6af544d7464e3e7dcd4dc802ae2b09728bf8f167f86a6487bb756d0cad5eb4a2480610b2967b9c24c4a7f70621894517aa2828ffdeb0479453803b + HEAD_REF master + PATCHES + disable-error-4068.patch # This patch fixes dependency port compilation failure diff --git a/ports/curl/portfile.cmake b/ports/curl/portfile.cmake index 6e18aecd0..2ccecf33c 100644 --- a/ports/curl/portfile.cmake diff --git a/c_glib/autogen.sh b/ci/vcpkg/x64-osx-static-debug.cmake old mode 100755 new mode 100644 similarity index 79% rename from c_glib/autogen.sh rename to ci/vcpkg/x64-osx-static-debug.cmake index eeca380bea8..e8a321ec71a --- a/c_glib/autogen.sh +++ 
b/ci/vcpkg/x64-osx-static-debug.cmake @@ -1,5 +1,3 @@ -#!/bin/sh -# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,10 +15,11 @@ # specific language governing permissions and limitations # under the License. -set -u -set -e +set(VCPKG_TARGET_ARCHITECTURE x64) +set(VCPKG_CRT_LINKAGE dynamic) +set(VCPKG_LIBRARY_LINKAGE static) -mkdir -p m4 +set(VCPKG_CMAKE_SYSTEM_NAME Darwin) +set(VCPKG_OSX_ARCHITECTURES x86_64) -gtkdocize --copy -autoreconf --install --force +set(VCPKG_BUILD_TYPE debug) diff --git a/c_glib/arrow-glib/arrow-orc-glib.pc.in b/ci/vcpkg/x64-osx-static-release.cmake similarity index 79% rename from c_glib/arrow-glib/arrow-orc-glib.pc.in rename to ci/vcpkg/x64-osx-static-release.cmake index 8e45d402549..956d5b92e73 100644 --- a/c_glib/arrow-glib/arrow-orc-glib.pc.in +++ b/ci/vcpkg/x64-osx-static-release.cmake @@ -15,12 +15,11 @@ # specific language governing permissions and limitations # under the License. 
-prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ +set(VCPKG_TARGET_ARCHITECTURE x64) +set(VCPKG_CRT_LINKAGE dynamic) +set(VCPKG_LIBRARY_LINKAGE static) -Name: Apache Arrow ORC GLib -Description: ORC modules for Apache Arrow GLib -Version: @VERSION@ -Requires: arrow-glib +set(VCPKG_CMAKE_SYSTEM_NAME Darwin) +set(VCPKG_OSX_ARCHITECTURES x86_64) + +set(VCPKG_BUILD_TYPE release) diff --git a/cpp/.gitignore b/cpp/.gitignore index a52014264f1..03c03a401a5 100644 --- a/cpp/.gitignore +++ b/cpp/.gitignore @@ -24,6 +24,7 @@ cmake_install.cmake build/ *-build/ Testing/ +build-support/boost_* # Build directories created by Clion cmake-build-*/ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f60469169c1..1705e854fb1 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -58,6 +58,14 @@ endif() string(TOLOWER ${CMAKE_BUILD_TYPE} LOWERCASE_BUILD_TYPE) string(TOUPPER ${CMAKE_BUILD_TYPE} UPPERCASE_BUILD_TYPE) +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules") + +# this must be included before the project() command, because of the way +# vcpkg (ab)uses CMAKE_TOOLCHAIN_FILE to inject its logic into CMake +if(ARROW_DEPENDENCY_SOURCE STREQUAL "VCPKG") + include(Usevcpkg) +endif() + project(arrow VERSION "${ARROW_BASE_VERSION}") set(ARROW_VERSION_MAJOR "${arrow_VERSION_MAJOR}") @@ -88,8 +96,6 @@ message(STATUS "Arrow SO version: ${ARROW_SO_VERSION} (full: ${ARROW_FULL_SO_VER set(ARROW_SOURCE_DIR ${PROJECT_SOURCE_DIR}) set(ARROW_BINARY_DIR ${PROJECT_BINARY_DIR}) -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules") - include(CMakePackageConfigHelpers) include(CMakeParseArguments) include(ExternalProject) @@ -739,7 +745,7 @@ endif() if(ARROW_WITH_RE2) list(APPEND ARROW_LINK_LIBS re2::re2) list(APPEND ARROW_STATIC_LINK_LIBS re2::re2) - if(RE2_SOURCE STREQUAL "SYSTEM") + if(re2_SOURCE STREQUAL "SYSTEM") list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS re2::re2) endif() endif() 
diff --git a/cpp/build-support/trim-boost.sh b/cpp/build-support/trim-boost.sh index ebc4ccd7a5d..3e5dd051105 100755 --- a/cpp/build-support/trim-boost.sh +++ b/cpp/build-support/trim-boost.sh @@ -22,23 +22,35 @@ # so that we don't have to download the whole big boost project when we build # boost from source. # -# After running this script, run upload-boost.sh to put the bundle on bintray +# To test building Arrow locally with the boost bundle this creates, add: +# +# set(BOOST_SOURCE_URL /path/to/arrow/cpp/build-support/boost_1_75_0/boost_1_75_0.tar.gz) +# +# to the beginning of the build_boost() macro in ThirdpartyToolchain.cmake, +# +# or set the env var ARROW_BOOST_URL before calling cmake, like: +# +# ARROW_BOOST_URL=/path/to/arrow/cpp/build-support/boost_1_75_0/boost_1_75_0.tar.gz cmake ... +# +# After running this script, upload the bundle to +# https://github.com/ursa-labs/thirdparty/releases/edit/latest +# TODO(ARROW-6407) automate uploading to github set -eu # if version is not defined by the caller, set a default. 
-: ${BOOST_VERSION:=1.71.0} +: ${BOOST_VERSION:=1.75.0} : ${BOOST_FILE:=boost_${BOOST_VERSION//./_}} -: ${BOOST_URL:=https://dl.bintray.com/boostorg/release/${BOOST_VERSION}/source/${BOOST_FILE}.tar.gz} +: ${BOOST_URL:=https://sourceforge.net/projects/boost/files/boost/${BOOST_VERSION}/${BOOST_FILE}.tar.gz} # Arrow tests require these -BOOST_LIBS="system.hpp filesystem.hpp" +BOOST_LIBS="system.hpp filesystem.hpp process.hpp" # Add these to be able to build those BOOST_LIBS="$BOOST_LIBS config build boost_install headers log predef" -# Gandiva needs these +# Gandiva needs these (and some Arrow tests do too) BOOST_LIBS="$BOOST_LIBS multiprecision/cpp_int.hpp" # These are for Thrift when Thrift_SOURCE=BUNDLED -BOOST_LIBS="$BOOST_LIBS algorithm/string.hpp locale.hpp noncopyable.hpp numeric/conversion/cast.hpp scope_exit.hpp typeof/incr_registration_group.hpp scoped_array.hpp shared_array.hpp tokenizer.hpp version.hpp" +BOOST_LIBS="$BOOST_LIBS locale.hpp scope_exit.hpp boost/typeof/incr_registration_group.hpp" if [ ! -d ${BOOST_FILE} ]; then curl -L "${BOOST_URL}" > ${BOOST_FILE}.tar.gz diff --git a/cpp/build-support/upload-boost.sh b/cpp/build-support/upload-boost.sh deleted file mode 100755 index 65e7e64db77..00000000000 --- a/cpp/build-support/upload-boost.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -# This assumes you've just run cpp/build-support/trim-boost.sh, so the file -# to upload is at ${BOOST_FILE}/${BOOST_FILE}.tar.gz -# -# Also, you must have a bintray account on the "ursalabs" organization and -# set the BINTRAY_USER and BINTRAY_APIKEY env vars. -# -# Ensure that the boost tarball is also updated at -# https://github.com/ursa-labs/thirdparty/releases/latest -# TODO(ARROW-6407) automate uploading to github as well. - -set -eu - -# if version is not defined by the caller, set a default. -: ${BOOST_VERSION:=1.71.0} -: ${BOOST_FILE:=boost_${BOOST_VERSION//./_}} -: ${DST_URL:=https://api.bintray.com/content/ursalabs/arrow-boost/arrow-boost/latest} - -if [ "$BINTRAY_USER" = "" ]; then - echo "Must set BINTRAY_USER" - exit 1 -fi -if [ "$BINTRAY_APIKEY" = "" ]; then - echo "Must set BINTRAY_APIKEY" - exit 1 -fi - -upload_file() { - if [ -f "$1" ]; then - echo "PUT ${DST_URL}/$1?override=1&publish=1" - curl -sS -u "${BINTRAY_USER}:${BINTRAY_APIKEY}" -X PUT "${DST_URL}/$1?override=1&publish=1" --data-binary "@$1" - else - echo "$1 not found" - fi -} - -pushd ${BOOST_FILE} -upload_file ${BOOST_FILE}.tar.gz -popd diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index e4df40d61b6..0e92811da8c 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -276,10 +276,11 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") # location, or if you are using a non-standard toolchain, you can also pass # ARROW_PACKAGE_PREFIX to set the *_ROOT variables to look in 
that # directory - # * CONDA: Same as system but set all *_ROOT variables to + # * CONDA: Same as SYSTEM but set all *_ROOT variables to # ENV{CONDA_PREFIX}. If this is run within an active conda environment, # then ENV{CONDA_PREFIX} will be used for dependencies unless # ARROW_DEPENDENCY_SOURCE is set explicitly to one of the other options + # * VCPKG: Searches for dependencies installed by vcpkg. # * BREW: Use SYSTEM but search for select packages with brew. if(NOT "$ENV{CONDA_PREFIX}" STREQUAL "") set(ARROW_DEPENDENCY_SOURCE_DEFAULT "CONDA") @@ -293,6 +294,7 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "BUNDLED" "SYSTEM" "CONDA" + "VCPKG" "BREW") define_option(ARROW_VERBOSE_THIRDPARTY_BUILD diff --git a/cpp/cmake_modules/FindBoostAlt.cmake b/cpp/cmake_modules/FindBoostAlt.cmake index 123c6dda1c7..1771937125e 100644 --- a/cpp/cmake_modules/FindBoostAlt.cmake +++ b/cpp/cmake_modules/FindBoostAlt.cmake @@ -38,16 +38,14 @@ if(ARROW_BOOST_USE_SHARED) set(BUILD_SHARED_LIBS_KEEP ${BUILD_SHARED_LIBS}) set(BUILD_SHARED_LIBS ON) - find_package(Boost ${BoostAlt_FIND_VERSION_OPTIONS} - COMPONENTS system filesystem) + find_package(Boost ${BoostAlt_FIND_VERSION_OPTIONS} COMPONENTS system filesystem) set(BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS_KEEP}) unset(BUILD_SHARED_LIBS_KEEP) else() # Find static boost headers and libs # TODO Differentiate here between release and debug builds set(Boost_USE_STATIC_LIBS ON) - find_package(Boost ${BoostAlt_FIND_VERSION_OPTIONS} - COMPONENTS system filesystem) + find_package(Boost ${BoostAlt_FIND_VERSION_OPTIONS} COMPONENTS system filesystem) endif() if(Boost_FOUND) diff --git a/cpp/cmake_modules/FindORC.cmake b/cpp/cmake_modules/FindORC.cmake index 1be149c93b2..061a0df2e9e 100644 --- a/cpp/cmake_modules/FindORC.cmake +++ b/cpp/cmake_modules/FindORC.cmake @@ -44,10 +44,9 @@ if(ORC_STATIC_LIB AND ORC_INCLUDE_DIR) add_library(orc::liborc STATIC IMPORTED) set_target_properties(orc::liborc PROPERTIES IMPORTED_LOCATION 
"${ORC_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES - "${ORC_INCLUDE_DIR}") + INTERFACE_INCLUDE_DIRECTORIES "${ORC_INCLUDE_DIR}") else() - if (ORC_FIND_REQUIRED) + if(ORC_FIND_REQUIRED) message(FATAL_ERROR "ORC library was required in toolchain and unable to locate") endif() set(ORC_FOUND FALSE) diff --git a/cpp/cmake_modules/FindSnappy.cmake b/cpp/cmake_modules/FindSnappy.cmake index 5784cf59220..26cccb786c5 100644 --- a/cpp/cmake_modules/FindSnappy.cmake +++ b/cpp/cmake_modules/FindSnappy.cmake @@ -26,9 +26,13 @@ if(ARROW_SNAPPY_USE_SHARED) else() set(SNAPPY_STATIC_LIB_NAME_BASE "snappy") if(MSVC) - set(SNAPPY_STATIC_LIB_NAME_BASE "${SNAPPY_STATIC_LIB_NAME_BASE}${SNAPPY_MSVC_STATIC_LIB_SUFFIX}") + set(SNAPPY_STATIC_LIB_NAME_BASE + "${SNAPPY_STATIC_LIB_NAME_BASE}${SNAPPY_MSVC_STATIC_LIB_SUFFIX}") endif() - set(SNAPPY_LIB_NAMES "${CMAKE_STATIC_LIBRARY_PREFIX}${SNAPPY_STATIC_LIB_NAME_BASE}${CMAKE_STATIC_LIBRARY_SUFFIX}") + set( + SNAPPY_LIB_NAMES + "${CMAKE_STATIC_LIBRARY_PREFIX}${SNAPPY_STATIC_LIB_NAME_BASE}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) endif() if(Snappy_ROOT) @@ -44,7 +48,9 @@ if(Snappy_ROOT) PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) else() find_library(Snappy_LIB NAMES ${SNAPPY_LIB_NAMES}) - find_path(Snappy_INCLUDE_DIR NAMES snappy.h PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) + find_path(Snappy_INCLUDE_DIR + NAMES snappy.h + PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) endif() find_package_handle_standard_args(Snappy REQUIRED_VARS Snappy_LIB Snappy_INCLUDE_DIR) diff --git a/cpp/cmake_modules/Findutf8proc.cmake b/cpp/cmake_modules/Findutf8proc.cmake index 560321df5db..edea73b8dae 100644 --- a/cpp/cmake_modules/Findutf8proc.cmake +++ b/cpp/cmake_modules/Findutf8proc.cmake @@ -29,37 +29,40 @@ else() endif() set(utf8proc_STATIC_LIB_SUFFIX "${utf8proc_MSVC_STATIC_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(utf8proc_LIB_NAMES "${CMAKE_STATIC_LIBRARY_PREFIX}utf8proc${utf8proc_STATIC_LIB_SUFFIX}") + set(utf8proc_LIB_NAMES + 
"${CMAKE_STATIC_LIBRARY_PREFIX}utf8proc${utf8proc_STATIC_LIB_SUFFIX}") endif() if(utf8proc_ROOT) - find_library( - utf8proc_LIB - NAMES ${utf8proc_LIB_NAMES} - PATHS ${utf8proc_ROOT} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) + find_library(utf8proc_LIB + NAMES ${utf8proc_LIB_NAMES} + PATHS ${utf8proc_ROOT} + PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} + NO_DEFAULT_PATH) find_path(utf8proc_INCLUDE_DIR NAMES utf8proc.h PATHS ${utf8proc_ROOT} NO_DEFAULT_PATH PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) else() - find_library( - utf8proc_LIB - NAMES ${utf8proc_LIB_NAMES} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) - find_path(utf8proc_INCLUDE_DIR NAMES utf8proc.h PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) + find_library(utf8proc_LIB + NAMES ${utf8proc_LIB_NAMES} + PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) + find_path(utf8proc_INCLUDE_DIR + NAMES utf8proc.h + PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) endif() -find_package_handle_standard_args(utf8proc REQUIRED_VARS utf8proc_LIB utf8proc_INCLUDE_DIR) +find_package_handle_standard_args(utf8proc REQUIRED_VARS utf8proc_LIB + utf8proc_INCLUDE_DIR) if(utf8proc_FOUND) set(utf8proc_FOUND TRUE) add_library(utf8proc::utf8proc UNKNOWN IMPORTED) - set_target_properties(utf8proc::utf8proc - PROPERTIES IMPORTED_LOCATION "${utf8proc_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${utf8proc_INCLUDE_DIR}") + set_target_properties( + utf8proc::utf8proc + PROPERTIES IMPORTED_LOCATION "${utf8proc_LIB}" INTERFACE_INCLUDE_DIRECTORIES + "${utf8proc_INCLUDE_DIR}") if(NOT ARROW_UTF8PROC_USE_SHARED) set_target_properties(utf8proc::utf8proc PROPERTIES INTERFACE_COMPILER_DEFINITIONS "UTF8PROC_STATIC") diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index b534552c3c0..9f68c560472 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -451,7 +451,9 @@ if(ARROW_CPU_FLAG STREQUAL "armv8") endif() set(CXX_COMMON_FLAGS 
"${CXX_COMMON_FLAGS} ${ARROW_ARMV8_ARCH_FLAG}") - add_definitions(-DARROW_HAVE_NEON) + if(NOT ARROW_SIMD_LEVEL STREQUAL "NONE") + add_definitions(-DARROW_HAVE_NEON) + endif() if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.4") diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index f526631b1aa..05cc642417a 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -69,6 +69,7 @@ set(ARROW_THIRDPARTY_DEPENDENCIES Snappy Thrift utf8proc + xsimd ZLIB zstd) @@ -170,6 +171,8 @@ macro(build_dependency DEPENDENCY_NAME) build_thrift() elseif("${DEPENDENCY_NAME}" STREQUAL "utf8proc") build_utf8proc() + elseif("${DEPENDENCY_NAME}" STREQUAL "xsimd") + build_xsimd() elseif("${DEPENDENCY_NAME}" STREQUAL "ZLIB") build_zlib() elseif("${DEPENDENCY_NAME}" STREQUAL "zstd") @@ -247,6 +250,10 @@ include_directories(SYSTEM "${THIRDPARTY_DIR}/flatbuffers/include") # ---------------------------------------------------------------------- # Some EP's require other EP's +if(PARQUET_REQUIRE_ENCRYPTION) + set(ARROW_JSON ON) +endif() + if(ARROW_THRIFT) set(ARROW_WITH_ZLIB ON) endif() @@ -368,7 +375,6 @@ else() AWSSDK_SOURCE_URL "https://github.com/aws/aws-sdk-cpp/archive/${ARROW_AWSSDK_BUILD_VERSION}.tar.gz" "https://github.com/ursa-labs/thirdparty/releases/download/latest/aws-sdk-cpp-${ARROW_AWSSDK_BUILD_VERSION}.tar.gz" - "https://dl.bintray.com/ursalabs/arrow-awssdk/aws-sdk-cpp-${ARROW_AWSSDK_BUILD_VERSION}.tar.gz/aws-sdk-cpp-${ARROW_AWSSDK_BUILD_VERSION}.tar.gz" ) endif() @@ -381,14 +387,12 @@ else() BOOST_SOURCE_URL # These are trimmed boost bundles we maintain. 
# See cpp/build-support/trim-boost.sh - "https://dl.bintray.com/ursalabs/arrow-boost/boost_${ARROW_BOOST_BUILD_VERSION_UNDERSCORES}.tar.gz" - "https://dl.bintray.com/boostorg/release/${ARROW_BOOST_BUILD_VERSION}/source/boost_${ARROW_BOOST_BUILD_VERSION_UNDERSCORES}.tar.gz" - "https://github.com/boostorg/boost/archive/boost-${ARROW_BOOST_BUILD_VERSION}.tar.gz" # FIXME(ARROW-6407) automate uploading this archive to ensure it reflects # our currently used packages and doesn't fall out of sync with # ${ARROW_BOOST_BUILD_VERSION_UNDERSCORES} "https://github.com/ursa-labs/thirdparty/releases/download/latest/boost_${ARROW_BOOST_BUILD_VERSION_UNDERSCORES}.tar.gz" - ) + "https://sourceforge.net/projects/boost/files/boost/${ARROW_BOOST_BUILD_VERSION}/boost_${ARROW_BOOST_BUILD_VERSION_UNDERSCORES}.tar.gz" + "https://github.com/boostorg/boost/archive/boost-${ARROW_BOOST_BUILD_VERSION}.tar.gz") endif() if(DEFINED ENV{ARROW_BROTLI_URL}) @@ -401,6 +405,16 @@ else() ) endif() +if(DEFINED ENV{ARROW_BZIP2_URL}) + set(ARROW_BZIP2_SOURCE_URL "$ENV{ARROW_BZIP2_URL}") +else() + set_urls( + ARROW_BZIP2_SOURCE_URL + "https://sourceware.org/pub/bzip2/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" + ) +endif() + if(DEFINED ENV{ARROW_CARES_URL}) set(CARES_SOURCE_URL "$ENV{ARROW_CARES_URL}") else() @@ -459,7 +473,6 @@ else() "https://github.com/google/googletest/archive/release-${ARROW_GTEST_BUILD_VERSION}.tar.gz" "https://chromium.googlesource.com/external/github.com/google/googletest/+archive/release-${ARROW_GTEST_BUILD_VERSION}.tar.gz" "https://github.com/ursa-labs/thirdparty/releases/download/latest/gtest-${ARROW_GTEST_BUILD_VERSION}.tar.gz" - "https://dl.bintray.com/ursalabs/arrow-gtest/gtest-${ARROW_GTEST_BUILD_VERSION}.tar.gz" ) endif() @@ -565,53 +578,54 @@ else() "https://mirrors.sonic.net/apache/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" 
"https://us.mirrors.quenda.co/apache/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" "https://github.com/ursa-labs/thirdparty/releases/download/latest/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" - "https://dl.bintray.com/ursalabs/arrow-thrift/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" ) endif() -if(DEFINED ENV{ARROW_ZLIB_URL}) - set(ZLIB_SOURCE_URL "$ENV{ARROW_ZLIB_URL}") +if(DEFINED ENV{ARROW_UTF8PROC_URL}) + set(ARROW_UTF8PROC_SOURCE_URL "$ENV{ARROW_UTF8PROC_URL}") else() set_urls( - ZLIB_SOURCE_URL "https://zlib.net/fossils/zlib-${ARROW_ZLIB_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/zlib-${ARROW_ZLIB_BUILD_VERSION}.tar.gz" + ARROW_UTF8PROC_SOURCE_URL + "https://github.com/JuliaStrings/utf8proc/archive/${ARROW_UTF8PROC_BUILD_VERSION}.tar.gz" ) endif() -if(DEFINED ENV{ARROW_ZSTD_URL}) - set(ZSTD_SOURCE_URL "$ENV{ARROW_ZSTD_URL}") +if(DEFINED ENV{ARROW_XSIMD_URL}) + set(XSIMD_SOURCE_URL "$ENV{ARROW_XSIMD_URL}") else() set_urls( - ZSTD_SOURCE_URL - "https://github.com/facebook/zstd/archive/${ARROW_ZSTD_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/zstd-${ARROW_ZSTD_BUILD_VERSION}.tar.gz" - ) + XSIMD_SOURCE_URL + "https://github.com/xtensor-stack/xsimd/archive/${ARROW_XSIMD_BUILD_VERSION}.tar.gz") endif() -if(DEFINED ENV{ARROW_BZIP2_SOURCE_URL}) - set(ARROW_BZIP2_SOURCE_URL "$ENV{ARROW_BZIP2_SOURCE_URL}") +if(DEFINED ENV{ARROW_ZLIB_URL}) + set(ZLIB_SOURCE_URL "$ENV{ARROW_ZLIB_URL}") else() set_urls( - ARROW_BZIP2_SOURCE_URL - "https://sourceware.org/pub/bzip2/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" + ZLIB_SOURCE_URL "https://zlib.net/fossils/zlib-${ARROW_ZLIB_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/zlib-${ARROW_ZLIB_BUILD_VERSION}.tar.gz" ) endif() -if(DEFINED 
ENV{ARROW_UTF8PROC_SOURCE_URL}) - set(ARROW_UTF8PROC_SOURCE_URL "$ENV{ARROW_UTF8PROC_SOURCE_URL}") +if(DEFINED ENV{ARROW_ZSTD_URL}) + set(ZSTD_SOURCE_URL "$ENV{ARROW_ZSTD_URL}") else() set_urls( - ARROW_UTF8PROC_SOURCE_URL - "https://github.com/JuliaStrings/utf8proc/archive/${ARROW_UTF8PROC_BUILD_VERSION}.tar.gz" + ZSTD_SOURCE_URL + "https://github.com/facebook/zstd/archive/${ARROW_ZSTD_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/zstd-${ARROW_ZSTD_BUILD_VERSION}.tar.gz" ) endif() # ---------------------------------------------------------------------- # ExternalProject options -set(EP_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}}") -set(EP_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}}") +set( + EP_CXX_FLAGS + "${CMAKE_CXX_COMPILER_ARG1} ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}}" + ) +set(EP_C_FLAGS + "${CMAKE_C_COMPILER_ARG1} ${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}}") if(NOT MSVC_TOOLCHAIN) # Set -fPIC on all external projects @@ -713,7 +727,7 @@ macro(build_boost) set(BOOST_CONFIGURE_COMMAND "./bootstrap.sh") endif() - set(BOOST_BUILD_WITH_LIBRARIES "filesystem" "regex" "system") + set(BOOST_BUILD_WITH_LIBRARIES "filesystem" "system") string(REPLACE ";" "," BOOST_CONFIGURE_LIBRARIES "${BOOST_BUILD_WITH_LIBRARIES}") list(APPEND BOOST_CONFIGURE_COMMAND "--prefix=${BOOST_PREFIX}" "--with-libraries=${BOOST_CONFIGURE_LIBRARIES}") @@ -754,10 +768,6 @@ macro(build_boost) BOOST_STATIC_FILESYSTEM_LIBRARY "${BOOST_LIB_DIR}/libboost_filesystem${BOOST_LIBRARY_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" ) - set( - BOOST_STATIC_REGEX_LIBRARY - "${BOOST_LIB_DIR}/libboost_regex${BOOST_LIBRARY_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) set(BOOST_SYSTEM_LIBRARY boost_system_static) set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static) set(BOOST_BUILD_PRODUCTS ${BOOST_STATIC_SYSTEM_LIBRARY} @@ -767,8 +777,6 @@ macro(build_boost) 
add_thirdparty_lib(boost_filesystem STATIC_LIB "${BOOST_STATIC_FILESYSTEM_LIBRARY}") - add_thirdparty_lib(boost_regex STATIC_LIB "${BOOST_STATIC_REGEX_LIBRARY}") - externalproject_add(boost_ep URL ${BOOST_SOURCE_URL} BUILD_BYPRODUCTS ${BOOST_BUILD_PRODUCTS} @@ -776,8 +784,7 @@ macro(build_boost) CONFIGURE_COMMAND ${BOOST_CONFIGURE_COMMAND} BUILD_COMMAND ${BOOST_BUILD_COMMAND} INSTALL_COMMAND "" ${EP_LOG_OPTIONS}) - list(APPEND ARROW_BUNDLED_STATIC_LIBS boost_system_static boost_filesystem_static - boost_regex_static) + list(APPEND ARROW_BUNDLED_STATIC_LIBS boost_system_static boost_filesystem_static) else() externalproject_add(boost_ep ${EP_LOG_OPTIONS} @@ -803,6 +810,8 @@ if(MSVC AND ARROW_USE_STATIC_CRT) set(Boost_USE_STATIC_RUNTIME ON) endif() set(Boost_ADDITIONAL_VERSIONS + "1.75.0" + "1.75" "1.74.0" "1.74" "1.73.0" @@ -1227,6 +1236,10 @@ endif() # Thrift macro(build_thrift) + if(CMAKE_VERSION VERSION_LESS 3.10) + message( + FATAL_ERROR "Building thrift using ExternalProject requires at least CMake 3.10") + endif() message("Building Apache Thrift from source") set(THRIFT_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/thrift_ep-install") set(THRIFT_INCLUDE_DIR "${THRIFT_PREFIX}/include") @@ -1236,8 +1249,6 @@ macro(build_thrift) "-DCMAKE_INSTALL_RPATH=${THRIFT_PREFIX}/lib" -DBUILD_COMPILER=OFF -DBUILD_SHARED_LIBS=OFF - # DWITH_SHARED_LIB is removed in 0.13 - -DWITH_SHARED_LIB=OFF -DBUILD_TESTING=OFF -DBUILD_EXAMPLES=OFF -DBUILD_TUTORIALS=OFF @@ -1634,6 +1645,10 @@ macro(build_gtest) GTEST_MAIN_SHARED_LIB "${_GTEST_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}gtest_main${_GTEST_LIBRARY_SUFFIX}" ) + set(GTEST_INSTALL_NAME_DIR "$/lib") + # Fix syntax highlighting mess introduced by unclosed bracket above + set(dummy ">") + set(GTEST_CMAKE_ARGS ${EP_COMMON_TOOLCHAIN} -DBUILD_SHARED_LIBS=ON @@ -1641,7 +1656,7 @@ macro(build_gtest) -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${GTEST_CMAKE_CXX_FLAGS} -DCMAKE_INSTALL_LIBDIR=lib - 
-DCMAKE_INSTALL_NAME_DIR=$/lib + -DCMAKE_INSTALL_NAME_DIR=${GTEST_INSTALL_NAME_DIR} -DCMAKE_INSTALL_PREFIX=${GTEST_PREFIX} -DCMAKE_MACOSX_RPATH=OFF) set(GMOCK_INCLUDE_DIR "${GTEST_PREFIX}/include") @@ -1904,6 +1919,33 @@ if(ARROW_WITH_RAPIDJSON) include_directories(SYSTEM ${RAPIDJSON_INCLUDE_DIR}) endif() +macro(build_xsimd) + message(STATUS "Building xsimd from source") + set(XSIMD_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/xsimd_ep/src/xsimd_ep-install") + set(XSIMD_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${XSIMD_PREFIX}") + + externalproject_add(xsimd_ep + ${EP_LOG_OPTIONS} + PREFIX "${CMAKE_BINARY_DIR}" + URL ${XSIMD_SOURCE_URL} + CMAKE_ARGS ${XSIMD_CMAKE_ARGS}) + + set(XSIMD_INCLUDE_DIR "${XSIMD_PREFIX}/include") + + add_dependencies(toolchain xsimd_ep) + add_dependencies(toolchain-tests xsimd_ep) + + set(XSIMD_VENDORED TRUE) +endmacro() + +# For now xsimd is always bundled from upstream +if(NOT ARROW_SIMD_LEVEL STREQUAL "NONE") + set(xsimd_SOURCE "BUNDLED") + resolve_dependency(xsimd) + # TODO: Don't use global includes but rather target_include_directories + include_directories(SYSTEM ${XSIMD_INCLUDE_DIR}) +endif() + macro(build_zlib) message(STATUS "Building ZLIB from source") set(ZLIB_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/zlib_ep/src/zlib_ep-install") @@ -2146,6 +2188,10 @@ macro(build_bzip2) set(BZIP2_EXTRA_ARGS "CC=${CMAKE_C_COMPILER}" "CFLAGS=${EP_C_FLAGS}") + if(CMAKE_OSX_SYSROOT) + list(APPEND BZIP2_EXTRA_ARGS "SDKROOT=${CMAKE_OSX_SYSROOT}") + endif() + externalproject_add(bzip2_ep ${EP_LOG_OPTIONS} CONFIGURE_COMMAND "" @@ -2329,30 +2375,30 @@ macro(build_grpc) set(ABSL_LIBRARIES) # Abseil libraries gRPC depends on + # Follows grpc++ package config template for link order of libraries + # https://github.com/grpc/grpc/blob/v1.35.0/CMakeLists.txt#L16361 set(_ABSL_LIBS - bad_optional_access - base + statusor + status cord + str_format_internal + synchronization graphcycles_internal - int128 - malloc_internal - raw_logging_internal - 
spinlock_wait + symbolize + demangle_internal stacktrace - status - statusor - str_format_internal + debugging_internal + malloc_internal + time + time_zone strings strings_internal - symbolize - # symbolize depends on debugging_internal - debugging_internal - # debugging_internal depends on demangle_internal - demangle_internal - synchronization throw_delegate - time - time_zone) + int128 + base + spinlock_wait + bad_optional_access + raw_logging_internal) foreach(_ABSL_LIB ${_ABSL_LIBS}) set( @@ -2516,12 +2562,13 @@ macro(build_grpc) add_library(gRPC::grpc++ STATIC IMPORTED) set_target_properties( gRPC::grpc++ - PROPERTIES IMPORTED_LOCATION - "${GRPC_STATIC_LIBRARY_GRPCPP}" - INTERFACE_LINK_LIBRARIES - "gRPC::grpc;gRPC::gpr;gRPC::upb;gRPC::address_sorting;${ABSL_LIBRARIES}" - INTERFACE_INCLUDE_DIRECTORIES - "${GRPC_INCLUDE_DIR}") + PROPERTIES + IMPORTED_LOCATION + "${GRPC_STATIC_LIBRARY_GRPCPP}" + INTERFACE_LINK_LIBRARIES + "gRPC::grpc;gRPC::gpr;gRPC::upb;gRPC::address_sorting;${ABSL_LIBRARIES};Threads::Threads" + INTERFACE_INCLUDE_DIRECTORIES + "${GRPC_INCLUDE_DIR}") add_executable(gRPC::grpc_cpp_plugin IMPORTED) set_target_properties(gRPC::grpc_cpp_plugin diff --git a/cpp/cmake_modules/Usevcpkg.cmake b/cpp/cmake_modules/Usevcpkg.cmake new file mode 100644 index 00000000000..781bec436f3 --- /dev/null +++ b/cpp/cmake_modules/Usevcpkg.cmake @@ -0,0 +1,217 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +message(STATUS "Using vcpkg to find dependencies") + +# ---------------------------------------------------------------------- +# Define macros + +# macro to list subdirectories (non-recursive) +macro(list_subdirs SUBDIRS DIR) + file(GLOB children_ RELATIVE ${DIR} ${DIR}/*) + set(subdirs_ "") + foreach(child_ ${children_}) + if(IS_DIRECTORY "${DIR}/${child_}") + list(APPEND subdirs_ ${child_}) + endif() + endforeach() + set("${SUBDIRS}" ${subdirs_}) + unset(children_) + unset(subdirs_) +endmacro() + +# ---------------------------------------------------------------------- +# Get VCPKG_ROOT + +if(DEFINED CMAKE_TOOLCHAIN_FILE) + # Get it from the CMake variable CMAKE_TOOLCHAIN_FILE + get_filename_component(_VCPKG_DOT_CMAKE "${CMAKE_TOOLCHAIN_FILE}" NAME) + if(EXISTS "${CMAKE_TOOLCHAIN_FILE}" AND _VCPKG_DOT_CMAKE STREQUAL "vcpkg.cmake") + get_filename_component(_VCPKG_BUILDSYSTEMS_DIR "${CMAKE_TOOLCHAIN_FILE}" DIRECTORY) + get_filename_component(VCPKG_ROOT "${_VCPKG_BUILDSYSTEMS_DIR}/../.."
ABSOLUTE) + else() + message( + FATAL_ERROR + "vcpkg toolchain file not found at path specified in -DCMAKE_TOOLCHAIN_FILE") + endif() +else() + if(DEFINED VCPKG_ROOT) + # Get it from the CMake variable VCPKG_ROOT + find_program(_VCPKG_BIN vcpkg PATHS "${VCPKG_ROOT}" NO_DEFAULT_PATH) + if(NOT _VCPKG_BIN) + message(FATAL_ERROR "vcpkg not found in directory specified in -DVCPKG_ROOT") + endif() + elseif(DEFINED ENV{VCPKG_ROOT}) + # Get it from the environment variable VCPKG_ROOT + set(VCPKG_ROOT $ENV{VCPKG_ROOT}) + find_program(_VCPKG_BIN vcpkg PATHS "${VCPKG_ROOT}" NO_DEFAULT_PATH) + if(NOT _VCPKG_BIN) + message( + FATAL_ERROR "vcpkg not found in directory in environment variable VCPKG_ROOT") + endif() + else() + # Get it from the file vcpkg.path.txt + find_program(_VCPKG_BIN vcpkg) + if(_VCPKG_BIN) + get_filename_component(_VCPKG_REAL_BIN "${_VCPKG_BIN}" REALPATH) + get_filename_component(VCPKG_ROOT "${_VCPKG_REAL_BIN}" DIRECTORY) + else() + if(CMAKE_HOST_WIN32) + set(_VCPKG_PATH_TXT "$ENV{LOCALAPPDATA}/vcpkg/vcpkg.path.txt") + else() + set(_VCPKG_PATH_TXT "$ENV{HOME}/.vcpkg/vcpkg.path.txt") + endif() + if(EXISTS "${_VCPKG_PATH_TXT}") + file(READ "${_VCPKG_PATH_TXT}" VCPKG_ROOT) + else() + message( + FATAL_ERROR + "vcpkg not found. Install vcpkg if not installed, " + "then run vcpkg integrate install or set environment variable VCPKG_ROOT.") + endif() + find_program(_VCPKG_BIN vcpkg PATHS "${VCPKG_ROOT}" NO_DEFAULT_PATH) + if(NOT _VCPKG_BIN) + message(FATAL_ERROR "vcpkg not found. 
Re-run vcpkg integrate install " + "or set environment variable VCPKG_ROOT.") + endif() + endif() + endif() + set(CMAKE_TOOLCHAIN_FILE + "${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" + CACHE FILEPATH "Path to vcpkg CMake toolchain file") +endif() +message(STATUS "Using CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}") +message(STATUS "Using VCPKG_ROOT: ${VCPKG_ROOT}") + +# ---------------------------------------------------------------------- +# Get VCPKG_TARGET_TRIPLET + +if(DEFINED ENV{VCPKG_DEFAULT_TRIPLET} AND NOT DEFINED VCPKG_TARGET_TRIPLET) + set(VCPKG_TARGET_TRIPLET "$ENV{VCPKG_DEFAULT_TRIPLET}") +endif() +# Explicitly set manifest mode on if it is not set and vcpkg.json exists +if(NOT DEFINED VCPKG_MANIFEST_MODE AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/vcpkg.json") + set(VCPKG_MANIFEST_MODE ON CACHE BOOL "Use vcpkg.json manifest") + message(STATUS "vcpkg.json manifest found. Using VCPKG_MANIFEST_MODE: ON") +endif() +# vcpkg can install packages in three different places +set(_INST_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/vcpkg_installed") # try here first +set(_INST_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/vcpkg_installed") # try here second +set(_INST_VCPKG_ROOT "${VCPKG_ROOT}/installed") +# Iterate over the places +foreach(_INST_DIR + IN + LISTS + _INST_BUILD_DIR + _INST_SOURCE_DIR + _INST_VCPKG_ROOT + "notfound") + if(_INST_DIR STREQUAL "notfound") + message(FATAL_ERROR "vcpkg installed libraries directory not found. 
" + "Install packages with vcpkg before executing cmake.") + elseif(NOT EXISTS "${_INST_DIR}") + continue() + elseif((_INST_DIR STREQUAL _INST_BUILD_DIR OR _INST_DIR STREQUAL _INST_SOURCE_DIR) + AND NOT VCPKG_MANIFEST_MODE) + # Do not look for packages in the build or source dirs if manifest mode is off + message(STATUS "Skipped looking for installed packages in ${_INST_DIR} " + "because -DVCPKG_MANIFEST_MODE=OFF") + continue() + else() + message(STATUS "Looking for installed packages in ${_INST_DIR}") + endif() + if(DEFINED VCPKG_TARGET_TRIPLET) + # Check if a subdirectory named VCPKG_TARGET_TRIPLET + # exists in the vcpkg installed directory + if(EXISTS "${_INST_DIR}/${VCPKG_TARGET_TRIPLET}") + set(_VCPKG_INSTALLED_DIR "${_INST_DIR}") + break() + endif() + else() + # Infer VCPKG_TARGET_TRIPLET from the name of the + # subdirectory in the vcpkg installed directory + list_subdirs(_VCPKG_TRIPLET_SUBDIRS "${_INST_DIR}") + list(REMOVE_ITEM _VCPKG_TRIPLET_SUBDIRS "vcpkg") + list(LENGTH _VCPKG_TRIPLET_SUBDIRS _NUM_VCPKG_TRIPLET_SUBDIRS) + if(_NUM_VCPKG_TRIPLET_SUBDIRS EQUAL 1) + list(GET _VCPKG_TRIPLET_SUBDIRS 0 VCPKG_TARGET_TRIPLET) + set(_VCPKG_INSTALLED_DIR "${_INST_DIR}") + break() + endif() + endif() +endforeach() +if(NOT DEFINED VCPKG_TARGET_TRIPLET) + message(FATAL_ERROR "Could not infer VCPKG_TARGET_TRIPLET. " + "Specify triplet with -DVCPKG_TARGET_TRIPLET.") +elseif(NOT DEFINED _VCPKG_INSTALLED_DIR) + message( + FATAL_ERROR + "Could not find installed vcpkg packages for triplet ${VCPKG_TARGET_TRIPLET}. 
" + "Install packages with vcpkg before executing cmake.") +endif() + +set(VCPKG_TARGET_TRIPLET + "${VCPKG_TARGET_TRIPLET}" + CACHE STRING "vcpkg triplet for the target environment") + +if(NOT DEFINED VCPKG_BUILD_TYPE) + set(VCPKG_BUILD_TYPE + "${LOWERCASE_BUILD_TYPE}" + CACHE STRING "vcpkg build type (release|debug)") +endif() + +if(NOT DEFINED VCPKG_LIBRARY_LINKAGE) + if(ARROW_DEPENDENCY_USE_SHARED) + set(VCPKG_LIBRARY_LINKAGE "dynamic") + else() + set(VCPKG_LIBRARY_LINKAGE "static") + endif() + set(VCPKG_LIBRARY_LINKAGE + "${VCPKG_LIBRARY_LINKAGE}" + CACHE STRING "vcpkg preferred library linkage (static|dynamic)") +endif() + +message(STATUS "Using vcpkg installed libraries directory: ${_VCPKG_INSTALLED_DIR}") +message(STATUS "Using VCPKG_TARGET_TRIPLET: ${VCPKG_TARGET_TRIPLET}") +message(STATUS "Using VCPKG_BUILD_TYPE: ${VCPKG_BUILD_TYPE}") +message(STATUS "Using VCPKG_LIBRARY_LINKAGE: ${VCPKG_LIBRARY_LINKAGE}") + +set(ARROW_VCPKG_PREFIX + "${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}" + CACHE PATH "Path to target triplet subdirectory in vcpkg installed directory") + +set(ARROW_VCPKG ON CACHE BOOL "Use vcpkg for dependencies") + +set(ARROW_DEPENDENCY_SOURCE + "SYSTEM" + CACHE STRING "The specified value VCPKG is implemented internally as SYSTEM" FORCE) + +set(BOOST_ROOT "${ARROW_VCPKG_PREFIX}" CACHE STRING "") +set(BOOST_INCLUDEDIR "${ARROW_VCPKG_PREFIX}/include/boost" CACHE STRING "") +set(BOOST_LIBRARYDIR "${ARROW_VCPKG_PREFIX}/lib" CACHE STRING "") +set(OPENSSL_INCLUDE_DIR "${ARROW_VCPKG_PREFIX}/include" CACHE STRING "") +set(OPENSSL_LIBRARIES "${ARROW_VCPKG_PREFIX}/lib" CACHE STRING "") +set(OPENSSL_ROOT_DIR "${ARROW_VCPKG_PREFIX}" CACHE STRING "") +set(Thrift_ROOT "${ARROW_VCPKG_PREFIX}/lib" CACHE STRING "") +set(ZSTD_INCLUDE_DIR "${ARROW_VCPKG_PREFIX}/include" CACHE STRING "") +set(ZSTD_ROOT "${ARROW_VCPKG_PREFIX}" CACHE STRING "") + +if(CMAKE_HOST_WIN32) + set(LZ4_MSVC_LIB_PREFIX "" CACHE STRING "") + set(LZ4_MSVC_STATIC_LIB_SUFFIX "" CACHE STRING "") 
+ set(ZSTD_MSVC_LIB_PREFIX "" CACHE STRING "") +endif() diff --git a/cpp/examples/minimal_build/example.cc b/cpp/examples/minimal_build/example.cc index 8f58de5777a..e1b5c123a85 100644 --- a/cpp/examples/minimal_build/example.cc +++ b/cpp/examples/minimal_build/example.cc @@ -39,7 +39,7 @@ Status RunMain(int argc, char** argv) { ARROW_ASSIGN_OR_RAISE( auto csv_reader, arrow::csv::TableReader::Make(arrow::default_memory_pool(), - arrow::io::AsyncContext(), + arrow::io::default_io_context(), input_file, arrow::csv::ReadOptions::Defaults(), arrow::csv::ParseOptions::Defaults(), diff --git a/cpp/src/arrow/ArrowConfig.cmake.in b/cpp/src/arrow/ArrowConfig.cmake.in index 0cf8a85f570..6209baeec67 100644 --- a/cpp/src/arrow/ArrowConfig.cmake.in +++ b/cpp/src/arrow/ArrowConfig.cmake.in @@ -37,6 +37,7 @@ set(ARROW_FULL_SO_VERSION "@ARROW_FULL_SO_VERSION@") set(ARROW_LIBRARY_PATH_SUFFIXES "@ARROW_LIBRARY_PATH_SUFFIXES@") set(ARROW_INCLUDE_PATH_SUFFIXES "@ARROW_INCLUDE_PATH_SUFFIXES@") set(ARROW_SYSTEM_DEPENDENCIES "@ARROW_SYSTEM_DEPENDENCIES@") +set(ARROW_BUNDLED_STATIC_LIBS "@ARROW_BUNDLED_STATIC_LIBS@") include("${CMAKE_CURRENT_LIST_DIR}/ArrowOptions.cmake") @@ -71,19 +72,21 @@ if(NOT (TARGET arrow_shared OR TARGET arrow_static)) get_property(arrow_static_loc TARGET arrow_static PROPERTY LOCATION) get_filename_component(arrow_lib_dir ${arrow_static_loc} DIRECTORY) - add_library(arrow_bundled_dependencies STATIC IMPORTED) - set_target_properties( - arrow_bundled_dependencies - PROPERTIES - IMPORTED_LOCATION - "${arrow_lib_dir}/${CMAKE_STATIC_LIBRARY_PREFIX}arrow_bundled_dependencies${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) + if(ARROW_BUNDLED_STATIC_LIBS) + add_library(arrow_bundled_dependencies STATIC IMPORTED) + set_target_properties( + arrow_bundled_dependencies + PROPERTIES + IMPORTED_LOCATION + "${arrow_lib_dir}/${CMAKE_STATIC_LIBRARY_PREFIX}arrow_bundled_dependencies${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) - get_property(arrow_static_interface_link_libraries - TARGET arrow_static - 
PROPERTY INTERFACE_LINK_LIBRARIES) - set_target_properties( - arrow_static PROPERTIES INTERFACE_LINK_LIBRARIES - "${arrow_static_interface_link_libraries};arrow_bundled_dependencies") + get_property(arrow_static_interface_link_libraries + TARGET arrow_static + PROPERTY INTERFACE_LINK_LIBRARIES) + set_target_properties( + arrow_static PROPERTIES INTERFACE_LINK_LIBRARIES + "${arrow_static_interface_link_libraries};arrow_bundled_dependencies") + endif() endif() endif() diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 382a851c159..df72dcc5b6b 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -181,6 +181,7 @@ set(ARROW_SRCS util/bitmap_builders.cc util/bitmap_ops.cc util/bpacking.cc + util/cancel.cc util/compression.cc util/cpu_info.cc util/decimal.cc @@ -349,6 +350,9 @@ if(ARROW_CSV) csv/options.cc csv/parser.cc csv/reader.cc) + if(ARROW_COMPUTE) + list(APPEND ARROW_SRCS csv/writer.cc) + endif() list(APPEND ARROW_TESTING_SRCS csv/test_common.cc) endif() @@ -369,6 +373,7 @@ if(ARROW_COMPUTE) compute/kernels/aggregate_tdigest.cc compute/kernels/aggregate_var_std.cc compute/kernels/codegen_internal.cc + compute/kernels/hash_aggregate.cc compute/kernels/scalar_arithmetic.cc compute/kernels/scalar_boolean.cc compute/kernels/scalar_cast_boolean.cc @@ -414,7 +419,6 @@ if(ARROW_FILESYSTEM) filesystem/filesystem.cc filesystem/localfs.cc filesystem/mockfs.cc - filesystem/path_forest.cc filesystem/path_util.cc filesystem/util_internal.cc) @@ -455,6 +459,8 @@ if(ARROW_JSON) json/chunked_builder.cc json/chunker.cc json/converter.cc + json/object_parser.cc + json/object_writer.cc json/parser.cc json/reader.cc) endif() diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index d3ae93318ba..db3c640b9a4 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -117,13 +117,13 @@ class BaseBinaryArray : public FlatArray { } } - IteratorType begin() { return 
IteratorType(*this); } + IteratorType begin() const { return IteratorType(*this); } - IteratorType end() { return IteratorType(*this, length()); } + IteratorType end() const { return IteratorType(*this, length()); } protected: // For subclasses - BaseBinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {} + BaseBinaryArray() = default; // Protected method for constructors void SetData(const std::shared_ptr& data) { @@ -132,8 +132,8 @@ class BaseBinaryArray : public FlatArray { raw_data_ = data->GetValuesSafe(2, /*offset=*/0); } - const offset_type* raw_value_offsets_; - const uint8_t* raw_data_; + const offset_type* raw_value_offsets_ = NULLPTR; + const uint8_t* raw_data_ = NULLPTR; }; /// Concrete Array class for variable-size binary data @@ -231,9 +231,9 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; } - IteratorType begin() { return IteratorType(*this); } + IteratorType begin() const { return IteratorType(*this); } - IteratorType end() { return IteratorType(*this, length()); } + IteratorType end() const { return IteratorType(*this, length()); } protected: void SetData(const std::shared_ptr& data) { diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index 1696653850b..a50cbcc13cf 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -20,6 +20,7 @@ #include #include +#include #include #include "arrow/array.h" @@ -197,10 +198,11 @@ class TestListArray : public TestBuilder { } void TestFromArrays() { - std::shared_ptr offsets1, offsets2, offsets3, offsets4, values; + std::shared_ptr offsets1, offsets2, offsets3, offsets4, offsets5, values; std::vector offsets_is_valid3 = {true, false, true, true}; std::vector offsets_is_valid4 = {true, true, false, true}; + std::vector offsets_is_valid5 = {true, true, false, false}; std::vector values_is_valid = {true, false, 
true, true, true, true}; @@ -217,6 +219,8 @@ class TestListArray : public TestBuilder { &offsets3); ArrayFromVector(offsets_is_valid4, offset2_values, &offsets4); + ArrayFromVector(offsets_is_valid5, offset2_values, + &offsets5); ArrayFromVector(values_is_valid, values_values, &values); @@ -254,6 +258,28 @@ class TestListArray : public TestBuilder { // Offsets not the right type ASSERT_RAISES(TypeError, ArrayType::FromArrays(*values, *offsets1, pool_)); + + // Null final offset + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("Last list offset should be non-null"), + ArrayType::FromArrays(*offsets5, *values, pool_)); + + // ARROW-12077: check for off-by-one in construction (need mimalloc/ASan/Valgrind) + { + std::shared_ptr offsets, values; + // Length multiple of 8 - we'll allocate a validity buffer with exactly enough bits + // (Need a large enough buffer or else ASan doesn't catch it) + std::vector offsets_is_valid(4096); + std::vector offset_values(4096); + std::vector values_values(4096); + std::fill(offsets_is_valid.begin(), offsets_is_valid.end(), true); + offsets_is_valid[1] = false; + std::fill(offset_values.begin(), offset_values.end(), 0); + std::fill(values_values.begin(), values_values.end(), 0); + ArrayFromVector(offsets_is_valid, offset_values, &offsets); + ArrayFromVector(values_values, &values); + ASSERT_OK_AND_ASSIGN(auto list, ArrayType::FromArrays(*offsets, *values, pool_)); + } } void TestAppendNull() { diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 97bbb18696c..f967127c5f1 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -70,12 +70,11 @@ Status CleanListOffsets(const Array& offsets, MemoryPool* pool, ARROW_ASSIGN_OR_RAISE(auto clean_offsets, AllocateBuffer(num_offsets * sizeof(offset_type), pool)); - // Copy valid bits, zero out the bit for the final offset - // XXX why? 
+ // Copy valid bits, ignoring the final offset (since for a length N list array, + // we have N + 1 offsets) ARROW_ASSIGN_OR_RAISE( auto clean_valid_bits, offsets.null_bitmap()->CopySlice(0, BitUtil::BytesForBits(num_offsets - 1))); - BitUtil::ClearBit(clean_valid_bits->mutable_data(), num_offsets); *validity_buf_out = clean_valid_bits; const offset_type* raw_offsets = typed_offsets.raw_values(); diff --git a/cpp/src/arrow/array/array_primitive.h b/cpp/src/arrow/array/array_primitive.h index f9ac60f6cb9..b601eb770c3 100644 --- a/cpp/src/arrow/array/array_primitive.h +++ b/cpp/src/arrow/array/array_primitive.h @@ -64,9 +64,9 @@ class NumericArray : public PrimitiveArray { // For API compatibility with BinaryArray etc. value_type GetView(int64_t i) const { return Value(i); } - IteratorType begin() { return IteratorType(*this); } + IteratorType begin() const { return IteratorType(*this); } - IteratorType end() { return IteratorType(*this, length()); } + IteratorType end() const { return IteratorType(*this, length()); } protected: using PrimitiveArray::PrimitiveArray; @@ -99,9 +99,9 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { /// values. Result is not cached. 
int64_t true_count() const; - IteratorType begin() { return IteratorType(*this); } + IteratorType begin() const { return IteratorType(*this); } - IteratorType end() { return IteratorType(*this, length()); } + IteratorType end() const { return IteratorType(*this, length()); } protected: using PrimitiveArray::PrimitiveArray; diff --git a/cpp/src/arrow/array/array_union_test.cc b/cpp/src/arrow/array/array_union_test.cc index 1eb722b13c5..88d25e823bb 100644 --- a/cpp/src/arrow/array/array_union_test.cc +++ b/cpp/src/arrow/array/array_union_test.cc @@ -152,7 +152,8 @@ class TestUnionArrayFactories : public ::testing::Test { TEST_F(TestUnionArrayFactories, TestMakeDense) { std::shared_ptr value_offsets; - ArrayFromVector({1, 0, 0, 0, 1, 0, 1, 2, 1, 2}, &value_offsets); + // type_ids_: {0, 1, 2, 0, 1, 3, 2, 0, 2, 1} + ArrayFromVector({0, 0, 0, 1, 1, 0, 1, 2, 1, 2}, &value_offsets); auto children = std::vector>(4); ArrayFromVector({"abc", "def", "xyz"}, &children[0]); @@ -208,12 +209,19 @@ TEST_F(TestUnionArrayFactories, TestMakeDense) { ASSERT_RAISES(Invalid, result->ValidateFull()); // Invalid offsets + // - offset out of bounds at index 5 std::shared_ptr invalid_offsets; - ArrayFromVector({1, 0, 0, 0, 1, 1, 1, 2, 1, 2}, &invalid_offsets); + ArrayFromVector({0, 0, 0, 1, 1, 1, 1, 2, 1, 2}, &invalid_offsets); ASSERT_OK_AND_ASSIGN(result, DenseUnionArray::Make(*type_ids_, *invalid_offsets, children)); ASSERT_RAISES(Invalid, result->ValidateFull()); - ArrayFromVector({1, 0, 0, 0, 1, -1, 1, 2, 1, 2}, &invalid_offsets); + // - negative offset at index 5 + ArrayFromVector({0, 0, 0, 1, 1, -1, 1, 2, 1, 2}, &invalid_offsets); + ASSERT_OK_AND_ASSIGN(result, + DenseUnionArray::Make(*type_ids_, *invalid_offsets, children)); + ASSERT_RAISES(Invalid, result->ValidateFull()); + // - non-monotonic offset at index 3 + ArrayFromVector({1, 0, 0, 0, 1, 0, 1, 2, 1, 2}, &invalid_offsets); ASSERT_OK_AND_ASSIGN(result, DenseUnionArray::Make(*type_ids_, *invalid_offsets, children)); 
ASSERT_RAISES(Invalid, result->ValidateFull()); diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 38092045aab..6ac885f8443 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -180,7 +180,7 @@ struct ValidateArrayImpl { bool IsBufferValid(int index) { return IsBufferValid(data, index); } static bool IsBufferValid(const ArrayData& data, int index) { - return data.buffers[index] != nullptr && data.buffers[index]->data() != nullptr; + return data.buffers[index] != nullptr && data.buffers[index]->address() != 0; } template @@ -527,6 +527,7 @@ struct ValidateArrayFullImpl { } // Check offsets are in bounds + std::vector last_child_offsets(256, 0); const int32_t* offsets = data.GetValues(2); for (int64_t i = 0; i < data.length; ++i) { const int32_t code = type_codes[i]; @@ -541,6 +542,11 @@ struct ValidateArrayFullImpl { "than child length (", offset, " >= ", child_lengths[code], ")"); } + if (offset < last_child_offsets[code]) { + return Status::Invalid("Union value at position ", i, + " has non-monotonic offset ", offset); + } + last_child_offsets[code] = offset; } } diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index 41a47c91729..f525ec23c58 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -162,6 +162,12 @@ class ARROW_EXPORT BufferBuilder { return Status::OK(); } + Result> Finish(bool shrink_to_fit = true) { + std::shared_ptr out; + ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit)); + return out; + } + void Reset() { buffer_ = NULLPTR; capacity_ = size_ = 0; @@ -202,6 +208,11 @@ class TypedBufferBuilder< MemoryPool* pool = default_memory_pool()) : bytes_builder_(std::move(buffer), pool) {} + explicit TypedBufferBuilder(BufferBuilder builder) + : bytes_builder_(std::move(builder)) {} + + BufferBuilder* bytes_builder() { return &bytes_builder_; } + Status Append(T value) { return bytes_builder_.Append(reinterpret_cast(&value), sizeof(T)); 
} @@ -256,6 +267,12 @@ class TypedBufferBuilder< return bytes_builder_.Finish(out, shrink_to_fit); } + Result> Finish(bool shrink_to_fit = true) { + std::shared_ptr out; + ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit)); + return out; + } + void Reset() { bytes_builder_.Reset(); } int64_t length() const { return bytes_builder_.length() / sizeof(T); } @@ -274,6 +291,11 @@ class TypedBufferBuilder { explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool()) : bytes_builder_(pool) {} + explicit TypedBufferBuilder(BufferBuilder builder) + : bytes_builder_(std::move(builder)) {} + + BufferBuilder* bytes_builder() { return &bytes_builder_; } + Status Append(bool value) { ARROW_RETURN_NOT_OK(Reserve(1)); UnsafeAppend(value); @@ -371,6 +393,12 @@ class TypedBufferBuilder { return bytes_builder_.Finish(out, shrink_to_fit); } + Result> Finish(bool shrink_to_fit = true) { + std::shared_ptr out; + ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit)); + return out; + } + void Reset() { bytes_builder_.Reset(); bit_length_ = false_count_ = 0; diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index 387105de9e7..6769b23867b 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -71,7 +71,7 @@ class EqualOptions { return res; } - static EqualOptions Defaults() { return EqualOptions(); } + static EqualOptions Defaults() { return {}; } protected: double atol_ = kDefaultAbsoluteTolerance; diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index eef1587bb73..ca118ec5678 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -306,5 +306,102 @@ Result TDigest(const Datum& value, const TDigestOptions& options = TDigestOptions::Defaults(), ExecContext* ctx = NULLPTR); +namespace internal { + +/// Internal use only: streaming group identifier. +/// Consumes batches of keys and yields batches of the group ids. 
+class ARROW_EXPORT Grouper { + public: + virtual ~Grouper() = default; + + /// Construct a Grouper which receives the specified key types + static Result> Make(const std::vector& descrs, + ExecContext* ctx = default_exec_context()); + + /// Consume a batch of keys, producing the corresponding group ids as an integer array. + /// Currently only uint32 indices will be produced, eventually the bit width will only + /// be as wide as necessary. + virtual Result Consume(const ExecBatch& batch) = 0; + + /// Get current unique keys. May be called multiple times. + virtual Result GetUniques() = 0; + + /// Get the current number of groups. + virtual uint32_t num_groups() const = 0; + + /// \brief Assemble lists of indices of identical elements. + /// + /// \param[in] ids An unsigned, all-valid integral array which will be + /// used as grouping criteria. + /// \param[in] num_groups An upper bound for the elements of ids + /// \return A num_groups-long ListArray where the slot at i contains a + /// list of indices where i appears in ids. + /// + /// MakeGroupings([ + /// 2, + /// 2, + /// 5, + /// 5, + /// 2, + /// 3 + /// ], 8) == [ + /// [], + /// [], + /// [0, 1, 4], + /// [5], + /// [], + /// [2, 3], + /// [], + /// [] + /// ] + static Result> MakeGroupings( + const UInt32Array& ids, uint32_t num_groups, + ExecContext* ctx = default_exec_context()); + + /// \brief Produce a ListArray whose slots are selections of `array` which correspond to + /// the provided groupings. 
+ /// + /// For example, + /// ApplyGroupings([ + /// [], + /// [], + /// [0, 1, 4], + /// [5], + /// [], + /// [2, 3], + /// [], + /// [] + /// ], [2, 2, 5, 5, 2, 3]) == [ + /// [], + /// [], + /// [2, 2, 2], + /// [3], + /// [], + /// [5, 5], + /// [], + /// [] + /// ] + static Result> ApplyGroupings( + const ListArray& groupings, const Array& array, + ExecContext* ctx = default_exec_context()); +}; + +/// \brief Configure a grouped aggregation +struct ARROW_EXPORT Aggregate { + /// the name of the aggregation function + std::string function; + + /// options for the aggregation function + const FunctionOptions* options; +}; + +/// Internal use only: helper function for testing HashAggregateKernels. +/// This will be replaced by streaming execution operators. +ARROW_EXPORT +Result GroupBy(const std::vector& arguments, const std::vector& keys, + const std::vector& aggregates, + ExecContext* ctx = default_exec_context()); + +} // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 671c8246378..f4696fbe02a 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -61,10 +61,17 @@ static Result ExecSetLookup(const std::string& func_name, const Datum& da if (!options.value_set.is_arraylike()) { return Status::Invalid("Set lookup value set must be Array or ChunkedArray"); } + std::shared_ptr data_type; + if (data.type()->id() == Type::DICTIONARY) { + data_type = + arrow::internal::checked_pointer_cast(data.type())->value_type(); + } else { + data_type = data.type(); + } - if (options.value_set.length() > 0 && !data.type()->Equals(options.value_set.type())) { + if (options.value_set.length() > 0 && !data_type->Equals(options.value_set.type())) { std::stringstream ss; - ss << "Array type didn't match type of values set: " << data.type()->ToString() + ss << "Array type didn't match type of values set: " << data_type->ToString() << " vs 
" << options.value_set.type()->ToString(); return Status::Invalid(ss.str()); } diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 37f3077e4bd..730836bd118 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -68,6 +68,19 @@ struct ARROW_EXPORT SplitPatternOptions : public SplitOptions { std::string pattern; }; +struct ARROW_EXPORT ReplaceSubstringOptions : public FunctionOptions { + explicit ReplaceSubstringOptions(std::string pattern, std::string replacement, + int64_t max_replacements = -1) + : pattern(pattern), replacement(replacement), max_replacements(max_replacements) {} + + /// Pattern to match, literal, or regular expression depending on which kernel is used + std::string pattern; + /// String to replace the pattern with + std::string replacement; + /// Max number of substrings to replace (-1 means unbounded) + int64_t max_replacements; +}; + /// Options for IsIn and IndexIn functions struct ARROW_EXPORT SetLookupOptions : public FunctionOptions { explicit SetLookupOptions(Datum value_set, bool skip_nulls = false) @@ -115,10 +128,25 @@ struct CompareOptions : public FunctionOptions { }; struct ARROW_EXPORT ProjectOptions : public FunctionOptions { - explicit ProjectOptions(std::vector n) : field_names(std::move(n)) {} + ProjectOptions(std::vector n, std::vector r, + std::vector> m) + : field_names(std::move(n)), + field_nullability(std::move(r)), + field_metadata(std::move(m)) {} + + explicit ProjectOptions(std::vector n) + : field_names(std::move(n)), + field_nullability(field_names.size(), true), + field_metadata(field_names.size(), NULLPTR) {} /// Names for wrapped columns std::vector field_names; + + /// Nullability bits for wrapped columns + std::vector field_nullability; + + /// Metadata attached to wrapped columns + std::vector> field_metadata; }; /// @} diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index 6443c96e918..c3187a3995a 100644 --- 
a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -36,6 +36,7 @@ #include "arrow/compute/registry.h" #include "arrow/compute/util_internal.h" #include "arrow/datum.h" +#include "arrow/record_batch.h" #include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/type.h" @@ -57,6 +58,44 @@ using internal::CpuInfo; namespace compute { +ExecContext* default_exec_context() { + static ExecContext default_ctx; + return &default_ctx; +} + +ExecBatch::ExecBatch(const RecordBatch& batch) + : values(batch.num_columns()), length(batch.num_rows()) { + auto columns = batch.column_data(); + std::move(columns.begin(), columns.end(), values.begin()); +} + +Result ExecBatch::Make(std::vector values) { + if (values.empty()) { + return Status::Invalid("Cannot infer ExecBatch length without at least one value"); + } + + int64_t length = -1; + for (const auto& value : values) { + if (value.is_scalar()) { + if (length == -1) { + length = 1; + } + continue; + } + + if (length == -1) { + length = value.length(); + continue; + } + + if (length != value.length()) { + return Status::Invalid( + "Arrays used to construct an ExecBatch must have equal length"); + } + } + + return ExecBatch(std::move(values), length); +} namespace { Result> AllocateDataBuffer(KernelContext* ctx, int64_t length, @@ -838,6 +877,7 @@ class ScalarAggExecutor : public KernelExecutorImpl { private: Status Consume(const ExecBatch& batch) { + // FIXME(ARROW-11840) don't merge *any* aggegates for every batch auto batch_state = kernel_->init(kernel_ctx_, {kernel_, *input_descrs_, options_}); ARROW_CTX_RETURN_IF_ERROR(kernel_ctx_); @@ -855,6 +895,7 @@ class ScalarAggExecutor : public KernelExecutorImpl { kernel_->merge(kernel_ctx_, std::move(*batch_state), state()); ARROW_CTX_RETURN_IF_ERROR(kernel_ctx_); + return Status::OK(); } diff --git a/cpp/src/arrow/compute/exec.h b/cpp/src/arrow/compute/exec.h index f491489ed8a..7659442d8bf 100644 --- a/cpp/src/arrow/compute/exec.h +++ 
b/cpp/src/arrow/compute/exec.h @@ -119,6 +119,8 @@ class ARROW_EXPORT ExecContext { bool use_threads_ = true; }; +ARROW_EXPORT ExecContext* default_exec_context(); + // TODO: Consider standardizing on uint16 selection vectors and only use them // when we can ensure that each value is 64K length or smaller @@ -164,11 +166,15 @@ class ARROW_EXPORT SelectionVector { /// TODO: Datum uses arrow/util/variant.h which may be a bit heavier-weight /// than is desirable for this class. Microbenchmarks would help determine for /// sure. See ARROW-8928. -struct ExecBatch { +struct ARROW_EXPORT ExecBatch { ExecBatch() = default; ExecBatch(std::vector values, int64_t length) : values(std::move(values)), length(length) {} + explicit ExecBatch(const RecordBatch& batch); + + static Result Make(std::vector values); + /// The values representing positional arguments to be passed to a kernel's /// exec function for processing. std::vector values; diff --git a/cpp/src/arrow/compute/exec_internal.h b/cpp/src/arrow/compute/exec_internal.h index a74e5c8d8fa..55daa243cd3 100644 --- a/cpp/src/arrow/compute/exec_internal.h +++ b/cpp/src/arrow/compute/exec_internal.h @@ -106,6 +106,11 @@ class ARROW_EXPORT KernelExecutor { public: virtual ~KernelExecutor() = default; + /// The Kernel's `init` method must be called and any KernelState set in the + /// KernelContext *before* KernelExecutor::Init is called. This is to facilitate + /// the case where init may be expensive and does not need to be called again for + /// each execution of the kernel, for example the same lookup table can be re-used + /// for all scanned batches in a dataset filter. 
virtual Status Init(KernelContext*, KernelInitArgs) = 0; /// XXX: Better configurability for listener diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc index 70d7d998e9c..c8fc8b8dec0 100644 --- a/cpp/src/arrow/compute/function.cc +++ b/cpp/src/arrow/compute/function.cc @@ -126,6 +126,11 @@ const Kernel* DispatchExactImpl(const Function* func, checked_cast(func)->kernels(), values); } + if (func->kind() == Function::HASH_AGGREGATE) { + return DispatchExactImpl(checked_cast(func)->kernels(), + values); + } + return nullptr; } @@ -184,8 +189,10 @@ Result Function::Execute(const std::vector& args, executor = detail::KernelExecutor::MakeScalar(); } else if (kind() == Function::VECTOR) { executor = detail::KernelExecutor::MakeVector(); - } else { + } else if (kind() == Function::SCALAR_AGGREGATE) { executor = detail::KernelExecutor::MakeScalarAggregate(); + } else { + return Status::NotImplemented("Direct execution of HASH_AGGREGATE functions"); } RETURN_NOT_OK(executor->Init(&kernel_ctx, {kernel, inputs, options})); @@ -263,6 +270,15 @@ Status ScalarAggregateFunction::AddKernel(ScalarAggregateKernel kernel) { return Status::OK(); } +Status HashAggregateFunction::AddKernel(HashAggregateKernel kernel) { + RETURN_NOT_OK(CheckArity(kernel.signature->in_types())); + if (arity_.is_varargs && !kernel.signature->is_varargs()) { + return Status::Invalid("Function accepts varargs but kernel signature does not"); + } + kernels_.emplace_back(std::move(kernel)); + return Status::OK(); +} + Result MetaFunction::Execute(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const { diff --git a/cpp/src/arrow/compute/function.h b/cpp/src/arrow/compute/function.h index af5d81a30ec..9a3e1c1852f 100644 --- a/cpp/src/arrow/compute/function.h +++ b/cpp/src/arrow/compute/function.h @@ -133,6 +133,10 @@ class ARROW_EXPORT Function { /// A function that computes scalar summary statistics from array input. 
SCALAR_AGGREGATE, + /// A function that computes grouped summary statistics from array input + /// and an array of group identifiers. + HASH_AGGREGATE, + /// A function that dispatches to other functions and does not contain its /// own kernels. META @@ -307,6 +311,21 @@ class ARROW_EXPORT ScalarAggregateFunction Status AddKernel(ScalarAggregateKernel kernel); }; +class ARROW_EXPORT HashAggregateFunction + : public detail::FunctionImpl { + public: + using KernelType = HashAggregateKernel; + + HashAggregateFunction(std::string name, const Arity& arity, const FunctionDoc* doc, + const FunctionOptions* default_options = NULLPTR) + : detail::FunctionImpl( + std::move(name), Function::HASH_AGGREGATE, arity, doc, default_options) {} + + /// \brief Add a kernel (function implementation). Returns error if the + /// kernel's signature does not match the function's arity. + Status AddKernel(HashAggregateKernel kernel); +}; + /// \brief A function that dispatches to other functions. Must implement /// MetaFunction::ExecuteImpl. /// diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index c8f9cacfb34..b99b41170d2 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -537,7 +537,8 @@ struct Kernel { : signature(std::move(sig)), init(std::move(init)) {} Kernel(std::vector in_types, OutputType out_type, KernelInit init) - : Kernel(KernelSignature::Make(std::move(in_types), out_type), std::move(init)) {} + : Kernel(KernelSignature::Make(std::move(in_types), std::move(out_type)), + std::move(init)) {} /// \brief The "signature" of the kernel containing the InputType input /// argument validators and OutputType output type and shape resolver. 
@@ -574,7 +575,8 @@ struct ArrayKernel : public Kernel { ArrayKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, KernelInit init = NULLPTR) - : Kernel(std::move(in_types), std::move(out_type), init), exec(std::move(exec)) {} + : Kernel(std::move(in_types), std::move(out_type), std::move(init)), + exec(std::move(exec)) {} /// \brief Perform a single invocation of this kernel. Depending on the /// implementation, it may only write into preallocated memory, while in some @@ -617,7 +619,7 @@ struct VectorKernel : public ArrayKernel { VectorKernel() = default; VectorKernel(std::shared_ptr sig, ArrayKernelExec exec) - : ArrayKernel(std::move(sig), exec) {} + : ArrayKernel(std::move(sig), std::move(exec)) {} VectorKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, KernelInit init = NULLPTR, VectorFinalize finalize = NULLPTR) @@ -680,12 +682,12 @@ using ScalarAggregateFinalize = std::function; /// * finalize: produces the end result of the aggregation using the /// KernelState in the KernelContext. 
struct ScalarAggregateKernel : public Kernel { - ScalarAggregateKernel() {} + ScalarAggregateKernel() = default; ScalarAggregateKernel(std::shared_ptr sig, KernelInit init, ScalarAggregateConsume consume, ScalarAggregateMerge merge, ScalarAggregateFinalize finalize) - : Kernel(std::move(sig), init), + : Kernel(std::move(sig), std::move(init)), consume(std::move(consume)), merge(std::move(merge)), finalize(std::move(finalize)) {} @@ -693,13 +695,59 @@ struct ScalarAggregateKernel : public Kernel { ScalarAggregateKernel(std::vector in_types, OutputType out_type, KernelInit init, ScalarAggregateConsume consume, ScalarAggregateMerge merge, ScalarAggregateFinalize finalize) - : ScalarAggregateKernel(KernelSignature::Make(std::move(in_types), out_type), init, - consume, merge, finalize) {} + : ScalarAggregateKernel( + KernelSignature::Make(std::move(in_types), std::move(out_type)), + std::move(init), std::move(consume), std::move(merge), std::move(finalize)) {} ScalarAggregateConsume consume; ScalarAggregateMerge merge; ScalarAggregateFinalize finalize; }; +// ---------------------------------------------------------------------- +// HashAggregateKernel (for HashAggregateFunction) + +using HashAggregateConsume = std::function; + +using HashAggregateMerge = + std::function; + +// Finalize returns Datum to permit multiple return values +using HashAggregateFinalize = std::function; + +/// \brief Kernel data structure for implementations of +/// HashAggregateFunction. The four necessary components of an aggregation +/// kernel are the init, consume, merge, and finalize functions. +/// +/// * init: creates a new KernelState for a kernel. +/// * consume: processes an ExecBatch (which includes the argument as well +/// as an array of group identifiers) and updates the KernelState found in the +/// KernelContext. +/// * merge: combines one KernelState with another. +/// * finalize: produces the end result of the aggregation using the +/// KernelState in the KernelContext. 
+struct HashAggregateKernel : public Kernel { + HashAggregateKernel() = default; + + HashAggregateKernel(std::shared_ptr sig, KernelInit init, + HashAggregateConsume consume, HashAggregateMerge merge, + HashAggregateFinalize finalize) + : Kernel(std::move(sig), std::move(init)), + consume(std::move(consume)), + merge(std::move(merge)), + finalize(std::move(finalize)) {} + + HashAggregateKernel(std::vector in_types, OutputType out_type, + KernelInit init, HashAggregateMerge merge, + HashAggregateConsume consume, HashAggregateFinalize finalize) + : HashAggregateKernel( + KernelSignature::Make(std::move(in_types), std::move(out_type)), + std::move(init), std::move(consume), std::move(merge), std::move(finalize)) {} + + HashAggregateConsume consume; + HashAggregateMerge merge; + HashAggregateFinalize finalize; +}; + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 577b250da87..5e223a1f906 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -59,5 +59,9 @@ add_arrow_benchmark(vector_selection_benchmark PREFIX "arrow-compute") # Aggregates -add_arrow_compute_test(aggregate_test SOURCES aggregate_test.cc test_util.cc) +add_arrow_compute_test(aggregate_test + SOURCES + aggregate_test.cc + hash_aggregate_test.cc + test_util.cc) add_arrow_benchmark(aggregate_benchmark PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index ac2604c9e89..61dc8cb403c 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -99,37 +99,11 @@ std::unique_ptr CountInit(KernelContext*, const KernelInitArgs& arg // ---------------------------------------------------------------------- // Sum implementation -// Round size optimized based on data type and compiler -template -struct 
RoundSizeDefault { - static constexpr int64_t size = 16; -}; - -// Round size set to 32 for float/int32_t/uint32_t -template <> -struct RoundSizeDefault { - static constexpr int64_t size = 32; -}; - -template <> -struct RoundSizeDefault { - static constexpr int64_t size = 32; -}; - -template <> -struct RoundSizeDefault { - static constexpr int64_t size = 32; -}; - template -struct SumImplDefault - : public SumImpl::CType>::size, - ArrowType, SimdLevel::NONE> {}; +struct SumImplDefault : public SumImpl {}; template -struct MeanImplDefault - : public MeanImpl::CType>::size, - ArrowType, SimdLevel::NONE> {}; +struct MeanImplDefault : public MeanImpl {}; std::unique_ptr SumInit(KernelContext* ctx, const KernelInitArgs& args) { SumLikeInit visitor(ctx, *args.inputs[0].type); @@ -276,15 +250,13 @@ const FunctionDoc min_max_doc{"Compute the minimum and maximum values of a numer {"array"}, "MinMaxOptions"}; -const FunctionDoc any_doc{ - "Test whether any element in a boolean array evaluates to true.", - ("Null values are ignored."), - {"array"}}; +const FunctionDoc any_doc{"Test whether any element in a boolean array evaluates to true", + ("Null values are ignored."), + {"array"}}; -const FunctionDoc all_doc{ - "Test whether all elements in a boolean array evaluate to true.", - ("Null values are ignored."), - {"array"}}; +const FunctionDoc all_doc{"Test whether all elements in a boolean array evaluate to true", + ("Null values are ignored."), + {"array"}}; } // namespace diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc b/cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc index e0c1118c714..feeb66a1489 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc @@ -24,37 +24,11 @@ namespace aggregate { // ---------------------------------------------------------------------- // Sum implementation -// Round size optimized based on data type and compiler -template -struct RoundSizeAvx2 { - 
static constexpr int64_t size = 32; -}; - -// Round size set to 64 for float/int32_t/uint32_t -template <> -struct RoundSizeAvx2 { - static constexpr int64_t size = 64; -}; - -template <> -struct RoundSizeAvx2 { - static constexpr int64_t size = 64; -}; - -template <> -struct RoundSizeAvx2 { - static constexpr int64_t size = 64; -}; - template -struct SumImplAvx2 - : public SumImpl::CType>::size, - ArrowType, SimdLevel::AVX2> {}; +struct SumImplAvx2 : public SumImpl {}; template -struct MeanImplAvx2 - : public MeanImpl::CType>::size, - ArrowType, SimdLevel::AVX2> {}; +struct MeanImplAvx2 : public MeanImpl {}; std::unique_ptr SumInitAvx2(KernelContext* ctx, const KernelInitArgs& args) { SumLikeInit visitor(ctx, *args.inputs[0].type); diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc b/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc index c2c748d3af7..522564a8469 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc @@ -24,37 +24,11 @@ namespace aggregate { // ---------------------------------------------------------------------- // Sum implementation -// Round size optimized based on data type and compiler -template -struct RoundSizeAvx512 { - static constexpr int64_t size = 32; -}; - -// Round size set to 64 for float/int32_t/uint32_t -template <> -struct RoundSizeAvx512 { - static constexpr int64_t size = 64; -}; - -template <> -struct RoundSizeAvx512 { - static constexpr int64_t size = 64; -}; - -template <> -struct RoundSizeAvx512 { - static constexpr int64_t size = 64; -}; - template -struct SumImplAvx512 - : public SumImpl::CType>::size, - ArrowType, SimdLevel::AVX512> {}; +struct SumImplAvx512 : public SumImpl {}; template -struct MeanImplAvx512 - : public MeanImpl::CType>::size, - ArrowType, SimdLevel::AVX512> {}; +struct MeanImplAvx512 : public MeanImpl {}; std::unique_ptr SumInitAvx512(KernelContext* ctx, const KernelInitArgs& args) { diff --git 
a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 733e6d1d0a6..5029c1855c0 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -51,229 +51,49 @@ void AddMinMaxAvx512AggKernels(ScalarAggregateFunction* func); // ---------------------------------------------------------------------- // Sum implementation -template -struct SumState { - using SumType = typename FindAccumulatorType::Type; - using ThisType = SumState; - using T = typename TypeTraits::CType; - using ArrayType = typename TypeTraits::ArrayType; - - ThisType operator+(const ThisType& rhs) const { - return ThisType(this->count + rhs.count, this->sum + rhs.sum); - } - - ThisType& operator+=(const ThisType& rhs) { - this->count += rhs.count; - this->sum += rhs.sum; - - return *this; - } - - public: - void Consume(const Array& input) { - const ArrayType& array = static_cast(input); - if (input.null_count() == 0) { - (*this) += ConsumeNoNulls(array); - } else { - (*this) += ConsumeWithNulls(array); - } - } - - size_t count = 0; - typename SumType::c_type sum = 0; - - private: - template - ThisType ConsumeNoNulls(const T* values, const int64_t length) const { - ThisType local; - const int64_t length_rounded = BitUtil::RoundDown(length, kNoNullsRoundSize); - typename SumType::c_type sum_rounded[kNoNullsRoundSize] = {0}; - - // Unroll the loop to add the results in parallel - for (int64_t i = 0; i < length_rounded; i += kNoNullsRoundSize) { - for (int64_t k = 0; k < kNoNullsRoundSize; k++) { - sum_rounded[k] += values[i + k]; - } - } - for (int64_t k = 0; k < kNoNullsRoundSize; k++) { - local.sum += sum_rounded[k]; - } - - // The trailing part - for (int64_t i = length_rounded; i < length; ++i) { - local.sum += values[i]; - } - - local.count = length; - return local; - } - - ThisType ConsumeNoNulls(const ArrayType& array) const { - const auto values = 
array.raw_values(); - const int64_t length = array.length(); - - return ConsumeNoNulls(values, length); - } - - // While this is not branchless, gcc needs this to be in a different function - // for it to generate cmov which tends to be slightly faster than - // multiplication but safe for handling NaN with doubles. - inline T MaskedValue(bool valid, T value) const { return valid ? value : 0; } - - inline ThisType UnrolledSum(uint8_t bits, const T* values) const { - ThisType local; - - if (bits < 0xFF) { - // Some nulls - for (size_t i = 0; i < 8; i++) { - local.sum += MaskedValue(bits & (1U << i), values[i]); - } - local.count += BitUtil::kBytePopcount[bits]; - } else { - // No nulls - for (size_t i = 0; i < 8; i++) { - local.sum += values[i]; - } - local.count += 8; - } - - return local; - } - - ThisType ConsumeWithNulls(const ArrayType& array) const { - ThisType local; - const T* values = array.raw_values(); - const int64_t length = array.length(); - int64_t offset = array.offset(); - const uint8_t* bitmap = array.null_bitmap_data(); - int64_t idx = 0; - - const auto p = arrow::internal::BitmapWordAlign<1>(bitmap, offset, length); - // First handle the leading bits - const int64_t leading_bits = p.leading_bits; - while (idx < leading_bits) { - if (BitUtil::GetBit(bitmap, offset)) { - local.sum += values[idx]; - local.count++; - } - idx++; - offset++; - } - - // The aligned parts scanned with BitBlockCounter - constexpr int64_t kBatchSize = arrow::internal::BitBlockCounter::kWordBits; - arrow::internal::BitBlockCounter data_counter(bitmap, offset, length - leading_bits); - auto current_block = data_counter.NextWord(); - while (idx < length) { - if (current_block.AllSet()) { // All true values - int run_length = 0; - // Scan forward until a block that has some false values (or the end) - while (current_block.length > 0 && current_block.AllSet()) { - run_length += current_block.length; - current_block = data_counter.NextWord(); - } - // Aggregate the no nulls parts 
- if (run_length >= kRoundSize * 8) { - local += ConsumeNoNulls(&values[idx], run_length); - } else { - local += ConsumeNoNulls<8>(&values[idx], run_length); - } - idx += run_length; - offset += run_length; - // The current_block already computed, advance to next loop - continue; - } else if (!current_block.NoneSet()) { // Some values are null - const int64_t idx_byte = BitUtil::BytesForBits(offset); - const uint8_t* aligned_bitmap = &bitmap[idx_byte]; - const T* aligned_values = &values[idx]; - - if (kBatchSize == current_block.length) { - for (int64_t i = 0; i < kBatchSize / 8; i++) { - local += UnrolledSum(aligned_bitmap[i], &aligned_values[i * 8]); - } - } else { // The end part - for (int64_t i = 0; i < current_block.length; i++) { - if (BitUtil::GetBit(bitmap, offset)) { - local.sum += values[idx]; - local.count++; - } - idx++; - offset++; - } - } - - idx += current_block.length; - offset += current_block.length; - } else { // All null values - idx += current_block.length; - offset += current_block.length; - } - current_block = data_counter.NextWord(); - } - - return local; - } -}; - -template -struct SumState { - using SumType = typename FindAccumulatorType::Type; - using ThisType = SumState; - - ThisType& operator+=(const ThisType& rhs) { - this->count += rhs.count; - this->sum += rhs.sum; - return *this; - } - - public: - void Consume(const Array& input) { - const BooleanArray& array = static_cast(input); - count += array.length() - array.null_count(); - sum += array.true_count(); - } - - size_t count = 0; - typename SumType::c_type sum = 0; -}; - -template +template struct SumImpl : public ScalarAggregator { - using ArrayType = typename TypeTraits::ArrayType; - using ThisType = SumImpl; + using ThisType = SumImpl; + using CType = typename ArrowType::c_type; using SumType = typename FindAccumulatorType::Type; using OutputType = typename TypeTraits::ScalarType; void Consume(KernelContext*, const ExecBatch& batch) override { - 
this->state.Consume(ArrayType(batch[0].array())); + const auto& data = batch[0].array(); + this->count = data->length - data->GetNullCount(); + if (is_boolean_type::value) { + this->sum = static_cast(BooleanArray(data).true_count()); + } else { + this->sum = + arrow::compute::detail::SumArray(*data); + } } void MergeFrom(KernelContext*, KernelState&& src) override { const auto& other = checked_cast(src); - this->state += other.state; + this->count += other.count; + this->sum += other.sum; } void Finalize(KernelContext*, Datum* out) override { - if (state.count == 0) { + if (this->count == 0) { out->value = std::make_shared(); } else { - out->value = MakeScalar(state.sum); + out->value = MakeScalar(this->sum); } } - SumState state; + size_t count = 0; + typename SumType::c_type sum = 0; }; -template -struct MeanImpl : public SumImpl { +template +struct MeanImpl : public SumImpl { void Finalize(KernelContext*, Datum* out) override { - const bool is_valid = this->state.count > 0; - const double divisor = static_cast(is_valid ? 
this->state.count : 1UL); - const double mean = static_cast(this->state.sum) / divisor; - - if (!is_valid) { + if (this->count == 0) { out->value = std::make_shared(); } else { + const double mean = static_cast(this->sum) / this->count; out->value = std::make_shared(mean); } } diff --git a/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc b/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc index c90dd03c06e..42be0c36544 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc @@ -300,6 +300,169 @@ BENCHMARK_TEMPLATE(ReferenceSum, SumBitmapVectorizeUnroll) ->Apply(BenchmarkSetArgs); #endif // ARROW_WITH_BENCHMARKS_REFERENCE +// +// GroupBy +// + +static void BenchmarkGroupBy(benchmark::State& state, + std::vector aggregates, + std::vector arguments, std::vector keys) { + for (auto _ : state) { + ABORT_NOT_OK(GroupBy(arguments, keys, aggregates).status()); + } +} + +#define GROUP_BY_BENCHMARK(Name, Impl) \ + static void Name(benchmark::State& state) { \ + RegressionArgs args(state, false); \ + auto rng = random::RandomArrayGenerator(1923); \ + (Impl)(); \ + } \ + BENCHMARK(Name)->Apply([](benchmark::internal::Benchmark* bench) { \ + BenchmarkSetArgsWithSizes(bench, {1 * 1024 * 1024}); \ + }) + +GROUP_BY_BENCHMARK(SumDoublesGroupedByTinyStringSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto key = rng.StringWithRepeats(args.size, + /*unique=*/16, + /*min_length=*/3, + /*max_length=*/32); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedBySmallStringSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto key = rng.StringWithRepeats(args.size, + /*unique=*/256, + 
/*min_length=*/3, + /*max_length=*/32); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedByMediumStringSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto key = rng.StringWithRepeats(args.size, + /*unique=*/4096, + /*min_length=*/3, + /*max_length=*/32); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedByTinyIntegerSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto key = rng.Int64(args.size, + /*min=*/0, + /*max=*/15); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedBySmallIntegerSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto key = rng.Int64(args.size, + /*min=*/0, + /*max=*/255); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedByMediumIntegerSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto key = rng.Int64(args.size, + /*min=*/0, + /*max=*/4095); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedByTinyIntStringPairSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto int_key = rng.Int64(args.size, + /*min=*/0, + /*max=*/4); + auto str_key = 
rng.StringWithRepeats(args.size, + /*unique=*/4, + /*min_length=*/3, + /*max_length=*/32); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {int_key, str_key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedBySmallIntStringPairSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto int_key = rng.Int64(args.size, + /*min=*/0, + /*max=*/15); + auto str_key = rng.StringWithRepeats(args.size, + /*unique=*/16, + /*min_length=*/3, + /*max_length=*/32); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {int_key, str_key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedByMediumIntStringPairSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto int_key = rng.Int64(args.size, + /*min=*/0, + /*max=*/63); + auto str_key = rng.StringWithRepeats(args.size, + /*unique=*/64, + /*min_length=*/3, + /*max_length=*/32); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {int_key, str_key}); +}); + // // Sum // diff --git a/cpp/src/arrow/compute/kernels/aggregate_internal.h b/cpp/src/arrow/compute/kernels/aggregate_internal.h index cb67794d942..67337f22c5b 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_internal.h @@ -19,6 +19,8 @@ #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/bit_run_reader.h" +#include "arrow/util/logging.h" namespace arrow { namespace compute { @@ -57,5 +59,107 @@ void AddAggKernel(std::shared_ptr sig, KernelInit init, ScalarAggregateFunction* func, SimdLevel::type simd_level = SimdLevel::NONE); +namespace detail { + +using arrow::internal::VisitSetBitRunsVoid; + +// non-recursive pairwise summation for floating points +// https://en.wikipedia.org/wiki/Pairwise_summation 
+template +enable_if_t::value, SumType> SumArray( + const ArrayData& data, ValueFunc&& func) { + const int64_t data_size = data.length - data.GetNullCount(); + if (data_size == 0) { + return 0; + } + + // number of inputs to accumulate before merging with another block + constexpr int kBlockSize = 16; // same as numpy + // levels (tree depth) = ceil(log2(len)) + 1, a bit larger than necessary + const int levels = BitUtil::Log2(static_cast(data_size)) + 1; + // temporary summation per level + std::vector sum(levels); + // whether two summations are ready and should be reduced to upper level + // one bit for each level, bit0 -> level0, ... + uint64_t mask = 0; + // level of root node holding the final summation + int root_level = 0; + + // reduce summation of one block (may be smaller than kBlockSize) from leaf node + // continue reducing to upper level if two summations are ready for non-leaf node + auto reduce = [&](SumType block_sum) { + int cur_level = 0; + uint64_t cur_level_mask = 1ULL; + sum[cur_level] += block_sum; + mask ^= cur_level_mask; + while ((mask & cur_level_mask) == 0) { + block_sum = sum[cur_level]; + sum[cur_level] = 0; + ++cur_level; + DCHECK_LT(cur_level, levels); + cur_level_mask <<= 1; + sum[cur_level] += block_sum; + mask ^= cur_level_mask; + } + root_level = std::max(root_level, cur_level); + }; + + const ValueType* values = data.GetValues(1); + VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length, + [&](int64_t pos, int64_t len) { + const ValueType* v = &values[pos]; + // unsigned division by constant is cheaper than signed one + const uint64_t blocks = static_cast(len) / kBlockSize; + const uint64_t remains = static_cast(len) % kBlockSize; + + for (uint64_t i = 0; i < blocks; ++i) { + SumType block_sum = 0; + for (int j = 0; j < kBlockSize; ++j) { + block_sum += func(v[j]); + } + reduce(block_sum); + v += kBlockSize; + } + + if (remains > 0) { + SumType block_sum = 0; + for (uint64_t i = 0; i < remains; ++i) { + block_sum += 
func(v[i]); + } + reduce(block_sum); + } + }); + + // reduce intermediate summations from all non-leaf nodes + for (int i = 1; i <= root_level; ++i) { + sum[i] += sum[i - 1]; + } + + return sum[root_level]; +} + +// naive summation for integers +template +enable_if_t::value, SumType> SumArray( + const ArrayData& data, ValueFunc&& func) { + SumType sum = 0; + const ValueType* values = data.GetValues(1); + VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length, + [&](int64_t pos, int64_t len) { + for (int64_t i = 0; i < len; ++i) { + sum += func(values[pos + i]); + } + }); + return sum; +} + +template +SumType SumArray(const ArrayData& data) { + return SumArray( + data, [](ValueType v) { return static_cast(v); }); +} + +} // namespace detail + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index e772d474909..22e7f512e97 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -27,19 +27,26 @@ #include "arrow/array.h" #include "arrow/chunked_array.h" #include "arrow/compute/api_aggregate.h" +#include "arrow/compute/api_scalar.h" +#include "arrow/compute/api_vector.h" +#include "arrow/compute/cast.h" #include "arrow/compute/kernels/aggregate_internal.h" #include "arrow/compute/kernels/test_util.h" +#include "arrow/compute/registry.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bitmap_reader.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/int_util_internal.h" #include "arrow/testing/gtest_common.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" +#include "arrow/util/logging.h" namespace arrow { +using internal::BitmapReader; using internal::checked_cast; using internal::checked_pointer_cast; @@ -65,8 +72,7 @@ static SumResult NaiveSumPartial(const Array& array) { const auto values = array_numeric.raw_values(); if 
(array.null_count() != 0) { - internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), - array.length()); + BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); for (int64_t i = 0; i < array.length(); i++) { if (reader.IsSet()) { result.first += values[i]; @@ -264,6 +270,29 @@ TYPED_TEST(TestRandomNumericSumKernel, RandomSliceArraySum) { } } +// Test round-off error +class TestSumKernelRoundOff : public ::testing::Test {}; + +TEST_F(TestSumKernelRoundOff, Basics) { + using ScalarType = typename TypeTraits::ScalarType; + + // array = np.arange(321000, dtype='float64') + // array -= np.mean(array) + // array *= arrray + double index = 0; + ASSERT_OK_AND_ASSIGN( + auto array, ArrayFromBuilderVisitor( + float64(), 321000, [&](NumericBuilder* builder) { + builder->UnsafeAppend((index - 160499.5) * (index - 160499.5)); + ++index; + })); + + // reference value from numpy.sum() + ASSERT_OK_AND_ASSIGN(Datum result, Sum(array)); + auto sum = checked_cast(result.scalar().get()); + ASSERT_EQ(sum->value, 2756346749973250.0); +} + // // Count // @@ -465,9 +494,7 @@ class TestPrimitiveMinMaxKernel : public ::testing::Test { void AssertMinMaxIsNull(const Datum& array, const MinMaxOptions& options) { ASSERT_OK_AND_ASSIGN(Datum out, MinMax(array, options)); - - const StructScalar& value = out.scalar_as(); - for (const auto& val : value.value) { + for (const auto& val : out.scalar_as().value) { ASSERT_FALSE(val->is_valid); } } @@ -623,8 +650,7 @@ static enable_if_integer> NaiveMinMax( T min = std::numeric_limits::max(); T max = std::numeric_limits::min(); if (array.null_count() != 0) { // Some values are null - internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), - array.length()); + BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); for (int64_t i = 0; i < array.length(); i++) { if (reader.IsSet()) { min = std::min(min, values[i]); @@ -663,8 +689,7 @@ static enable_if_floating_point> NaiveMinMax( 
T min = std::numeric_limits::infinity(); T max = -std::numeric_limits::infinity(); if (array.null_count() != 0) { // Some values are null - internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), - array.length()); + BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); for (int64_t i = 0; i < array.length(); i++) { if (reader.IsSet()) { min = std::fmin(min, values[i]); @@ -1007,7 +1032,7 @@ ModeResult NaiveMode(const Array& array) { const auto& array_numeric = reinterpret_cast(array); const auto values = array_numeric.raw_values(); - internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); + BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); for (int64_t i = 0; i < array.length(); ++i) { if (reader.IsSet()) { ++value_counts[values[i]]; @@ -1205,6 +1230,26 @@ TEST_F(TestVarStdKernelMergeStability, Basics) { #endif } +// Test round-off error +template +class TestVarStdKernelRoundOff : public TestPrimitiveVarStdKernel {}; + +typedef ::testing::Types VarStdRoundOffTypes; + +TYPED_TEST_SUITE(TestVarStdKernelRoundOff, VarStdRoundOffTypes); +TYPED_TEST(TestVarStdKernelRoundOff, Basics) { + // build array: np.arange(321000, dtype='xxx') + typename TypeParam::c_type value = 0; + ASSERT_OK_AND_ASSIGN( + auto array, ArrayFromBuilderVisitor(TypeTraits::type_singleton(), 321000, + [&](NumericBuilder* builder) { + builder->UnsafeAppend(value++); + })); + + // reference value from numpy.var() + this->AssertVarStdIs(*array, VarianceOptions{0}, 8586749999.916667); +} + // Test integer arithmetic code class TestVarStdKernelInt32 : public TestPrimitiveVarStdKernel {}; @@ -1238,7 +1283,7 @@ void KahanSum(double& sum, double& adjust, double addend) { template std::pair WelfordVar(const ArrayType& array) { const auto values = array.raw_values(); - internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); + BitmapReader reader(array.null_bitmap_data(), 
array.offset(), array.length()); double count = 0, mean = 0, m2 = 0; double mean_adjust = 0, m2_adjust = 0; for (int64_t i = 0; i < array.length(); ++i) { diff --git a/cpp/src/arrow/compute/kernels/aggregate_var_std.cc b/cpp/src/arrow/compute/kernels/aggregate_var_std.cc index 6e1d9300f42..d11e73efd77 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_var_std.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_var_std.cc @@ -30,6 +30,7 @@ namespace internal { namespace { using arrow::internal::int128_t; +using arrow::internal::VisitSetBitRunsVoid; template struct VarStdState { @@ -49,24 +50,13 @@ struct VarStdState { using SumType = typename std::conditional::value, double, int128_t>::type; - SumType sum = 0; - - const ArrayData& data = *array.data(); - const CType* values = data.GetValues(1); - arrow::internal::VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length, - [&](int64_t pos, int64_t len) { - for (int64_t i = 0; i < len; ++i) { - sum += static_cast(values[pos + i]); - } - }); - - double mean = static_cast(sum) / count, m2 = 0; - arrow::internal::VisitSetBitRunsVoid( - data.buffers[0], data.offset, data.length, [&](int64_t pos, int64_t len) { - for (int64_t i = 0; i < len; ++i) { - const double v = static_cast(values[pos + i]); - m2 += (v - mean) * (v - mean); - } + SumType sum = arrow::compute::detail::SumArray(*array.data()); + + const double mean = static_cast(sum) / count; + const double m2 = arrow::compute::detail::SumArray( + *array.data(), [mean](CType value) { + const double v = static_cast(value); + return (v - mean) * (v - mean); }); this->count = count; @@ -98,14 +88,14 @@ struct VarStdState { int128_t square_sum = 0; const ArrayData& data = *slice->data(); const CType* values = data.GetValues(1); - arrow::internal::VisitSetBitRunsVoid( - data.buffers[0], data.offset, data.length, [&](int64_t pos, int64_t len) { - for (int64_t i = 0; i < len; ++i) { - const auto value = values[pos + i]; - sum += value; - square_sum += static_cast(value) * 
value; - } - }); + VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length, + [&](int64_t pos, int64_t len) { + for (int64_t i = 0; i < len; ++i) { + const auto value = values[pos + i]; + sum += value; + square_sum += static_cast(value) * value; + } + }); const double mean = static_cast(sum) / count; // calculate m2 = square_sum - sum * sum / count diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index b321ff3fc8b..ad43b7a3aa9 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -48,6 +48,7 @@ std::vector> g_numeric_types; std::vector> g_base_binary_types; std::vector> g_temporal_types; std::vector> g_primitive_types; +std::vector g_decimal_type_ids; static std::once_flag codegen_static_initialized; template @@ -71,6 +72,9 @@ static void InitStaticData() { // Floating point types g_floating_types = {float32(), float64()}; + // Decimal types + g_decimal_type_ids = {Type::DECIMAL128, Type::DECIMAL256}; + // Numeric types Extend(g_int_types, &g_numeric_types); Extend(g_floating_types, &g_numeric_types); @@ -132,6 +136,11 @@ const std::vector>& FloatingPointTypes() { return g_floating_types; } +const std::vector& DecimalTypeIds() { + std::call_once(codegen_static_initialized, InitStaticData); + return g_decimal_type_ids; +} + const std::vector& AllTimeUnits() { static std::vector units = {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO, TimeUnit::NANO}; diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 8c49e796623..b5d6c3807f1 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -188,6 +188,16 @@ struct GetViewType { } }; +template <> +struct GetViewType { + using T = Decimal256; + using PhysicalType = util::string_view; + + static T LogicalValue(PhysicalType value) { + return 
Decimal256(reinterpret_cast(value.data())); + } +}; + template struct GetOutputType; @@ -206,6 +216,11 @@ struct GetOutputType { using T = Decimal128; }; +template <> +struct GetOutputType { + using T = Decimal256; +}; + // ---------------------------------------------------------------------- // Iteration / value access utilities @@ -313,6 +328,13 @@ struct UnboxScalar { } }; +template <> +struct UnboxScalar { + static Decimal256 Unbox(const Scalar& val) { + return checked_cast(val).value; + } +}; + template struct BoxScalar; @@ -339,6 +361,13 @@ struct BoxScalar { static void Box(T val, Scalar* out) { checked_cast(out)->value = val; } }; +template <> +struct BoxScalar { + using T = Decimal256; + using ScalarType = Decimal256Scalar; + static void Box(T val, Scalar* out) { checked_cast(out)->value = val; } +}; + // A VisitArrayDataInline variant that calls its visitor function with logical // values, such as Decimal128 rather than util::string_view. @@ -396,6 +425,7 @@ const std::vector>& SignedIntTypes(); const std::vector>& UnsignedIntTypes(); const std::vector>& IntTypes(); const std::vector>& FloatingPointTypes(); +const std::vector& DecimalTypeIds(); ARROW_EXPORT const std::vector& AllTimeUnits(); @@ -659,12 +689,13 @@ struct ScalarUnaryNotNullStateful { }; template - struct ArrayExec::value>> { + struct ArrayExec> { static void Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0, Datum* out) { ArrayData* out_arr = out->mutable_array(); // Decimal128 data buffers are not safely reinterpret_cast-able on big-endian - using endian_agnostic = std::array; + using endian_agnostic = + std::array::ScalarType::ValueType)>; auto out_data = out_arr->GetMutableValues(1); VisitArrayValuesInline( arg0, @@ -1185,6 +1216,22 @@ ArrayKernelExec GenerateTemporal(detail::GetTypeId get_id) { } } +// Generate a kernel given a templated functor for decimal types +// +// See "Numeric" above for description of the generator functor +template