diff --git a/.env b/.env index 3c146239ef2..cd6b57e004a 100644 --- a/.env +++ b/.env @@ -69,5 +69,6 @@ R_TAG=latest # -1 does not attempt to install a devtoolset version, any positive integer will install devtoolset-n DEVTOOLSET_VERSION=-1 -# Used for the manylinux and windows wheels +# Used for the manylinux and windows wheels, please update the crossbow configuration on update: +# https://github.com/ursacomputing/crossbow/blob/master/.github/workflows/cache_vcpkg.yml VCPKG=fced4bef1606260f110d74de1ae1975c2b9ac549 diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 8869de77b34..0bdecac2d6d 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -20,24 +20,24 @@ name: R on: push: paths: - - '.github/workflows/r.yml' - - 'ci/scripts/r_*.sh' - - 'ci/scripts/cpp_*.sh' - - 'ci/scripts/PKGBUILD' - - 'ci/etc/rprofile' - - 'ci/docker/**' - - 'cpp/**' - - 'r/**' + - ".github/workflows/r.yml" + - "ci/scripts/r_*.sh" + - "ci/scripts/cpp_*.sh" + - "ci/scripts/PKGBUILD" + - "ci/etc/rprofile" + - "ci/docker/**" + - "cpp/**" + - "r/**" pull_request: paths: - - '.github/workflows/r.yml' - - 'ci/scripts/r_*.sh' - - 'ci/scripts/cpp_*.sh' - - 'ci/scripts/PKGBUILD' - - 'ci/etc/rprofile' - - 'ci/docker/**' - - 'cpp/**' - - 'r/**' + - ".github/workflows/r.yml" + - "ci/scripts/r_*.sh" + - "ci/scripts/cpp_*.sh" + - "ci/scripts/PKGBUILD" + - "ci/etc/rprofile" + - "ci/docker/**" + - "cpp/**" + - "r/**" env: DOCKER_VOLUME_PREFIX: ".docker/" @@ -86,6 +86,15 @@ jobs: - name: Dump install logs run: cat r/check/arrow.Rcheck/00install.out if: always() + - name: Dump test logs + run: cat r/check/arrow.Rcheck/tests/testthat.Rout* + if: always() + - name: Save the test output + if: always() + uses: actions/upload-artifact@v2 + with: + name: test-output + path: r/check/arrow.Rcheck/tests/testthat.Rout* - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' continue-on-error: true @@ -99,8 +108,8 @@ jobs: fail-fast: false matrix: config: - - {org: 'rstudio', image: 'r-base', tag: '4.0-centos7'} - - {org: 'rhub', image: 'debian-gcc-devel', tag: 'latest'} + - { org: "rstudio", image: "r-base", tag: "4.0-centos7" } + - { org: "rhub", image: "debian-gcc-devel", tag: "latest" } env: R_ORG: ${{ matrix.config.org }} R_IMAGE: ${{ matrix.config.image }} @@ -134,6 +143,15 @@ jobs: - name: Dump install logs run: cat r/check/arrow.Rcheck/00install.out if: always() + - name: Dump test logs + run: cat r/check/arrow.Rcheck/tests/testthat.Rout* + if: always() + - name: Save the test output + if: always() + uses: actions/upload-artifact@v2 + with: + name: test-output + path: r/check/arrow.Rcheck/tests/testthat.Rout* - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' continue-on-error: true @@ -149,7 +167,7 @@ jobs: rtools: [35, 40] env: TEST_R_WITH_ARROW: "TRUE" - ARROW_R_CXXFLAGS: '-Werror' + ARROW_R_CXXFLAGS: "-Werror" _R_CHECK_TESTS_NLINES_: 0 steps: - run: git config --global core.autocrlf false @@ -187,13 +205,13 @@ jobs: - uses: r-lib/actions/setup-r@master with: rtools-version: 40 - r-version: '4.0' + r-version: "4.0" Ncpus: 2 - uses: r-lib/actions/setup-r@master if: ${{ matrix.rtools == 35 }} with: rtools-version: 35 - r-version: '3.6' + r-version: "3.6" Ncpus: 2 - name: Build Arrow C++ shell: bash @@ -221,7 +239,8 @@ jobs: build_args = '--no-build-vignettes', args = c('--no-manual', '--as-cran', '--ignore-vignettes', '--run-donttest'), error_on = 'warning', - check_dir = 'check' + check_dir = 'check', 
+ timeout = 3600 ) - name: Dump install logs shell: cmd diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 90831341d6c..1dd220ade94 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -116,8 +116,8 @@ jobs: # run tests on all workspace members with default feature list cargo test # test datafusion examples - cd datafusion - cargo test --no-default-features --features cli + cd datafusion-examples + cargo test --no-default-features cargo run --example csv_sql cargo run --example parquet_sql cd .. diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e70eaceaf41..9d2d2d81d68 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,9 +40,10 @@ repos: - id: cmake-format name: CMake Format language: python - entry: bash -c "pip install cmake-format && python run-cmake-format.py --check" - entry: echo - files: ^(.*/CMakeLists.txt|.*.cmake)$ + entry: python run-cmake-format.py + types: [cmake] + additional_dependencies: + - cmake_format==0.5.2 - id: hadolint name: Docker Format language: docker_image diff --git a/c_glib/README.md b/c_glib/README.md index 4ef9612c868..ac179354d8f 100644 --- a/c_glib/README.md +++ b/c_glib/README.md @@ -19,12 +19,14 @@ # Arrow GLib -Arrow GLib is a wrapper library for [Arrow C++](https://github.com/apache/arrow/tree/master/cpp). Arrow GLib provides C -API. +Arrow GLib is a wrapper library for [Arrow +C++](https://github.com/apache/arrow/tree/master/cpp). Arrow GLib +provides C API. -Arrow GLib supports -[GObject Introspection](https://wiki.gnome.org/action/show/Projects/GObjectIntrospection). -It means that you can create language bindings at runtime or compile time. +Arrow GLib supports [GObject +Introspection](https://wiki.gnome.org/action/show/Projects/GObjectIntrospection). +It means that you can create language bindings at runtime or compile +time. For example, you can use Apache Arrow from Ruby by Arrow GLib and [gobject-introspection gem](https://rubygems.org/gems/gobject-introspection) @@ -50,71 +52,47 @@ gobject-introspection gem based bindings. You can use packages or build by yourself to install Arrow GLib. It's recommended that you use packages. -Note that the packages are "unofficial". "Official" packages will be -released in the future. +We use Meson and Ninja as build tools. If you find problems when +installing please see [common build +problems](https://github.com/apache/arrow/blob/master/c_glib/README.md#common-build-problems). -We support two build systems, GNU Autotools and Meson. If you find problems when installing please see [common build problems](https://github.com/apache/arrow/blob/master/c_glib/README.md#common-build-problems). - -### Package +### Packages See [install document](https://arrow.apache.org/install/) for details. ### How to build by users Arrow GLib users should use released source archive to build Arrow -GLib (replace the version number in the following commands with the one you use): +GLib (replace the version number in the following commands with the +one you use): ```console -% wget https://archive.apache.org/dist/arrow/arrow-0.3.0/apache-arrow-0.3.0.tar.gz -% tar xf apache-arrow-0.3.0.tar.gz -% cd apache-arrow-0.3.0 +% wget https://downloads.apache.org/arrow/arrow-3.0.0/apache-arrow-3.0.0.tar.gz +% tar xf apache-arrow-3.0.0.tar.gz +% cd apache-arrow-3.0.0 ``` You need to build and install Arrow C++ before you build and install Arrow GLib. See Arrow C++ document about how to install Arrow C++. 
-If you use macOS with [Homebrew](https://brew.sh/), you must install required packages and set `PKG_CONFIG_PATH` before build Arrow GLib: - -If you use GNU Autotools, you can build and install Arrow GLib by the followings: - -macOS: - -```console -% cd c_glib -% brew bundle -% ./configure PKG_CONFIG_PATH=$(brew --prefix libffi)/lib/pkgconfig:$PKG_CONFIG_PATH -% make -% sudo make install -``` - -Others: - -```console -% cd c_glib -% ./configure -% make -% sudo make install -``` - -If you use Meson, you can build and install Arrow GLib by the followings: +If you use macOS with [Homebrew](https://brew.sh/), you must install +required packages. macOS: ```console -% cd c_glib -% brew bundle -% PKG_CONFIG_PATH=$(brew --prefix libffi)/lib/pkgconfig:$PKG_CONFIG_PATH meson build --buildtype=release -% ninja -C build -% sudo ninja -C build install +$ brew bundle +$ meson setup c_glib.build c_glib --buildtype=release +$ meson compile -C c_glib.build +$ sudo meson install -C c_glib.build ``` Others: ```console -% cd c_glib -% meson build --buildtype=release -% ninja -C build -% sudo ninja -C build install +$ meson setup c_glib.build c_glib --buildtype=release +$ meson compile -C c_glib.build +$ sudo meson install -C build ``` ### How to build by developers @@ -129,51 +107,46 @@ to build Arrow GLib. You can install them by the followings: On Debian GNU/Linux or Ubuntu: ```console -% sudo apt install -y -V gtk-doc-tools autoconf-archive libgirepository1.0-dev meson ninja-build +$ sudo apt install -y -V gtk-doc-tools libgirepository1.0-dev meson ninja-build ``` -On CentOS 7 or later: +On CentOS 7: ```console -% sudo yum install -y gtk-doc gobject-introspection-devel -% sudo pip install -y meson ninja -``` - -On macOS with [Homebrew](https://brew.sh/): - -```text -% brew bundle +$ sudo yum install -y gtk-doc gobject-introspection-devel ninja-build +$ sudo pip3 install meson ``` -If you use GNU Autotools, you can build and install Arrow GLib by the followings: +On CentOS 8 or later: ```console -% cd c_glib -% ./autogen.sh -% ./configure --enable-gtk-doc -% make -% sudo make install +$ sudo dnf install -y --enablerepo=powertools gtk-doc gobject-introspection-devel ninja-build +$ sudo pip3 install meson ``` -You need to set `PKG_CONFIG_PATH` to `configure` On macOS: +On macOS with [Homebrew](https://brew.sh/): ```console -% ./configure PKG_CONFIG_PATH=$(brew --prefix libffi)/lib/pkgconfig:$PKG_CONFIG_PATH --enable-gtk-doc +$ brew bundle ``` -If you use Meson, you can build and install Arrow GLib by the followings: +You can build and install Arrow GLib by the followings: + +macOS: ```console -% cd c_glib -% meson build -Dgtk_doc=true -% ninja -C build -% sudo ninja -C build install +$ XML_CATALOG_FILES=$(brew --prefix)/etc/xml/catalog +$ meson setup c_glib.build c_glib -Dgtk_doc=true +$ meson compile -C c_glib.build +$ sudo meson install -C c_glib.build ``` -You need to set `PKG_CONFIG_PATH` on macOS: +Others: ```console -% PKG_CONFIG_PATH=$(brew --prefix libffi)/lib/pkgconfig:$PKG_CONFIG_PATH meson build -Dgtk_doc=true +$ meson c_glib.build c_glib -Dgtk_doc=true +$ meson compile -C c_glib.build +$ sudo meson install -C c_glib.build ``` ## Usage @@ -186,7 +159,7 @@ languages, you use GObject Introspection based bindings. You can find API reference in the `/usr/local/share/gtk-doc/html/arrow-glib/` directory. If you specify -`--prefix` to `configure`, the directory will be different. +`--prefix` to `meson`, the directory will be different. You can find example codes in the `example/` directory. 
@@ -225,101 +198,118 @@ You can install them by the followings: On Debian GNU/Linux or Ubuntu: ```console -% sudo apt install -y -V ruby-dev -% sudo gem install bundler -% (cd c_glib && bundle install) +$ sudo apt install -y -V ruby-dev +$ sudo gem install bundler +$ (cd c_glib && bundle install) ``` On CentOS 7 or later: ```console -% sudo yum install -y git -% git clone https://github.com/sstephenson/rbenv.git ~/.rbenv -% git clone https://github.com/sstephenson/ruby-build.git ~/.rbenv/plugins/ruby-build -% echo 'export PATH="$HOME/.rbenv/bin:$PATH"' >> ~/.bash_profile -% echo 'eval "$(rbenv init -)"' >> ~/.bash_profile -% exec ${SHELL} --login -% sudo yum install -y gcc make patch openssl-devel readline-devel zlib-devel -% rbenv install 2.4.1 -% rbenv global 2.4.1 -% gem install bundler -% (cd c_glib && bundle install) +$ sudo yum install -y git +$ git clone https://github.com/sstephenson/rbenv.git ~/.rbenv +$ git clone https://github.com/sstephenson/ruby-build.git ~/.rbenv/plugins/ruby-build +$ echo 'export PATH="$HOME/.rbenv/bin:$PATH"' >> ~/.bash_profile +$ echo 'eval "$(rbenv init -)"' >> ~/.bash_profile +$ exec ${SHELL} --login +$ sudo yum install -y gcc make patch openssl-devel readline-devel zlib-devel +$ latest_ruby_version=$(rbenv install --list 2>&1 | grep '^[0-9]' | tail -n1) +$ rbenv install ${latest_ruby_version} +$ rbenv global ${latest_ruby_version} +$ gem install bundler +$ (cd c_glib && bundle install) ``` On macOS with [Homebrew](https://brew.sh/): ```console -% (cd c_glib && bundle install) +$ (cd c_glib && bundle install) ``` Now, you can run unit tests by the followings: ```console -% cd c_glib -% bundle exec test/run-test.sh +$ cd c_glib.build +$ bundle exec ../c_glib/test/run-test.sh ``` ## Common build problems -### configure failed - `AX_CXX_COMPILE_STDCXX_11(ext, mandatory)' +### build failed - /usr/bin/ld: cannot find -larrow -* Check whether `autoconf-archive` is installed. -* [macOS] `autoconf-archive` must be linked, but may not be linked. You can check it by running `brew install autoconf-archive` again. If it's not linked, it will show a warning message like: +Arrow C++ must be installed to build Arrow GLib. Run `make install` on +Arrow C++ build directory. In addition, on linux, you may need to run +`sudo ldconfig`. -```console -% brew install autoconf-archive -Warning: autoconf-archive 2017.03.21 is already installed, it's just not linked. -You can use `brew link autoconf-archive` to link this version. -``` +### build failed - unable to load http://docbook.sourceforge.net/release/xsl/current/html/chunk.xsl -In this case, you need to run `brew link autoconf-archive`. It may fail with the following message if you have install conflicted packages (e.g. `gnome-common`). +You need to set the following environment variable on macOS: ```console -% brew link autoconf-archive -Linking /usr/local/Cellar/autoconf-archive/2017.03.21... -Error: Could not symlink share/aclocal/ax_check_enable_debug.m4 -Target /usr/local/share/aclocal/ax_check_enable_debug.m4 -is a symlink belonging to gnome-common. You can unlink it: - brew unlink gnome-common +$ export XML_CATALOG_FILES="$(brew --prefix)/etc/xml/catalog" ``` -You need to run `brew unlink `, then run `brew link autoconf-archive` again. - -After installing/linking `autoconf-archive`, run `./autogen.sh` again. 
+### build failed - Symbol not found, referenced from `libsource-highlight.4.dylib` -### [macOS] configure failed - gobject-introspection-1.0 is not installed +You may get the following error on macOS: -gobject-introspection requires libffi, and it's automatically installed with gobject-introspection. However it can't be found because it's [keg-only](https://docs.brew.sh/FAQ.html#what-does-keg-only-mean). You need to set `PKG_CONFIG_PATH` when executing configure. -```console -% ./configure PKG_CONFIG_PATH=$(brew --prefix libffi)/lib/pkgconfig +```text +dyld: Symbol not found: __ZN5boost16re_detail_10650112perl_matcherIPKcNSt3__19allocatorINS_9sub_matchIS3_EEEENS_12regex_traitsIcNS_16cpp_regex_traitsIcEEEEE14construct_initERKNS_11basic_regexIcSC_EENS_15regex_constants12_match_flagsE + Referenced from: /usr/local/Cellar/source-highlight/3.1.8_7/lib/libsource-highlight.4.dylib + Expected in: flat namespace + in /usr/local/Cellar/source-highlight/3.1.8_7/lib/libsource-highlight.4.dylib ``` -### build failed - /usr/bin/ld: cannot find -larrow - -Arrow C++ must be installed to build Arrow GLib. Run `make install` on Arrow C++ build directory. In addition, on linux, you may need to run `sudo ldconfig`. - -### build failed - unable to load http://docbook.sourceforge.net/release/xsl/current/html/chunk.xsl - -On macOS you may need to set the following environment variable: +To fix this error, you need to upgrade `source-highlight`: ```console -% export XML_CATALOG_FILES="/usr/local/etc/xml/catalog" +$ brew upgrade source-highlight ``` -### build failed - Symbol not found, referenced from `libsource-highlight.4.dylib` +### test failed - Failed to load shared library '...' referenced by the typelib: dlopen(...): dependent dylib '@rpath/...' not found for '...'. relative file paths not allowed '@rpath/...' -On macOS if you see the following error you may need to upgrade `source-highlight` +You may get the following error on macOS by running test: -```console -dyld: Symbol not found: __ZN5boost16re_detail_10650112perl_matcherIPKcNSt3__19allocatorINS_9sub_matchIS3_EEEENS_12regex_traitsIcNS_16cpp_regex_traitsIcEEEEE14construct_initERKNS_11basic_regexIcSC_EENS_15regex_constants12_match_flagsE - Referenced from: /usr/local/Cellar/source-highlight/3.1.8_7/lib/libsource-highlight.4.dylib - Expected in: flat namespace - in /usr/local/Cellar/source-highlight/3.1.8_7/lib/libsource-highlight.4.dylib +```text +(NULL)-WARNING **: Failed to load shared library '/usr/local/lib/libparquet-glib.400.dylib' referenced by the typelib: dlopen(/usr/local/lib/libparquet-glib.400.dylib, 0x0009): dependent dylib '@rpath/libparquet.400.dylib' not found for '/usr/local/lib/libparquet-glib.400.dylib'. 
relative file paths not allowed '@rpath/libparquet.400.dylib' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/loader.rb:215:in `load_object_info' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/loader.rb:68:in `load_info' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/loader.rb:43:in `block in load' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/repository.rb:34:in `block (2 levels) in each' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/repository.rb:33:in `times' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/repository.rb:33:in `block in each' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/repository.rb:32:in `each' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/repository.rb:32:in `each' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/loader.rb:42:in `load' + from /Library/Ruby/Gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection.rb:44:in `load' + from /Users/karlkatzen/Documents/code/arrow-dev/arrow/c_glib/test/run-test.rb:60:in `
' +Traceback (most recent call last): + 17: from /Users/karlkatzen/Documents/code/arrow-dev/arrow/c_glib/test/run-test.rb:80:in `
' + 16: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/autorunner.rb:66:in `run' + 15: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/autorunner.rb:434:in `run' + 14: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/autorunner.rb:106:in `block in ' + 13: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:38:in `collect' + 12: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:136:in `add_load_path' + 11: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:43:in `block in collect' + 10: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:43:in `each' + 9: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:46:in `block (2 levels) in collect' + 8: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:85:in `collect_recursive' + 7: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:85:in `each' + 6: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:87:in `block in collect_recursive' + 5: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:112:in `collect_file' + 4: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:136:in `add_load_path' + 3: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:114:in `block in collect_file' + 2: from /Library/Ruby/Gems/2.6.0/gems/test-unit-3.4.0/lib/test/unit/collector/load.rb:114:in `require' + 1: from /Users/karlkatzen/Documents/code/arrow-dev/arrow/c_glib/test/test-extension-data-type.rb:18:in `' +/Users/karlkatzen/Documents/code/arrow-dev/arrow/c_glib/test/test-extension-data-type.rb:19:in `': uninitialized constant Arrow::ExtensionArray (NameError) ``` -To fix do: +You can't use `@rpath` in Arrow C++. To fix this error, you need to +build Arrow C++ with `-DARROW_INSTALL_NAME_RPATH=OFF`: ```console -% brew upgrade source-highlight +$ cmake -S cpp -B cpp.build -DARROW_INSTALL_NAME_RPATH=OFF ... +$ cmake --build cpp.build +$ sudo cmake --build cpp.build --target install ``` diff --git a/c_glib/arrow-cuda-glib/Makefile.am b/c_glib/arrow-cuda-glib/Makefile.am deleted file mode 100644 index bcf20bb549e..00000000000 --- a/c_glib/arrow-cuda-glib/Makefile.am +++ /dev/null @@ -1,130 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -CLEANFILES = - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -if HAVE_ARROW_CUDA -lib_LTLIBRARIES = \ - libarrow-cuda-glib.la - -libarrow_cuda_glib_la_CXXFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CUDA_CFLAGS) \ - $(GARROW_CXXFLAGS) - -libarrow_cuda_glib_la_LDFLAGS = \ - -version-info $(LT_VERSION_INFO) \ - -no-undefined - -libarrow_cuda_glib_la_LIBADD = \ - $(GLIB_LIBS) \ - ../arrow-glib/libarrow-glib.la \ - $(ARROW_CUDA_LIBS) - -libarrow_cuda_glib_la_headers = \ - arrow-cuda-glib.h \ - cuda.h - -libarrow_cuda_glib_la_sources = \ - cuda.cpp \ - $(libarrow_cuda_glib_la_headers) - -libarrow_cuda_glib_la_cpp_headers = \ - arrow-cuda-glib.hpp \ - cuda.hpp - -libarrow_cuda_glib_la_SOURCES = \ - $(libarrow_cuda_glib_la_sources) \ - $(libarrow_cuda_glib_la_cpp_headers) - -arrow_cuda_glib_includedir = \ - $(includedir)/arrow-cuda-glib -arrow_cuda_glib_include_HEADERS = \ - $(libarrow_cuda_glib_la_headers) \ - $(libarrow_cuda_glib_la_cpp_headers) - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = \ - arrow-cuda-glib.pc - -if HAVE_INTROSPECTION --include $(INTROSPECTION_MAKEFILE) -INTROSPECTION_GIRS = -INTROSPECTION_SCANNER_ARGS = -INTROSPECTION_SCANNER_ENV = -if USE_ARROW_BUILD_DIR -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_builddir}/../arrow-glib:$(ARROW_BUILD_DIR)/src/arrow:$${PKG_CONFIG_PATH} -else -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_builddir}/../arrow-glib:$${PKG_CONFIG_PATH} -endif -INTROSPECTION_COMPILER_ARGS = \ - --includedir=$(abs_builddir)/../arrow-glib - -ArrowCUDA-1.0.gir: libarrow-cuda-glib.la -ArrowCUDA_1_0_gir_PACKAGES = \ - arrow-glib -ArrowCUDA_1_0_gir_EXPORT_PACKAGES = \ - arrow-cuda-glib -ArrowCUDA_1_0_gir_INCLUDES = \ - Arrow-1.0 -ArrowCUDA_1_0_gir_CFLAGS = \ - $(AM_CPPFLAGS) -ArrowCUDA_1_0_gir_LIBS = -ArrowCUDA_1_0_gir_FILES = \ - $(libarrow_cuda_glib_la_sources) -ArrowCUDA_1_0_gir_SCANNERFLAGS = \ - --library-path=$(ARROW_LIB_DIR) \ - --warn-all \ - --add-include-path=$(abs_builddir)/../arrow-glib \ - --identifier-prefix=GArrowCUDA \ - --symbol-prefix=garrow_cuda -if OS_MACOS -ArrowCUDA_1_0_gir_LIBS += \ - arrow-glib \ - arrow-cuda-glib -ArrowCUDA_1_0_gir_SCANNERFLAGS += \ - --no-libtool \ - --library-path=$(abs_builddir)/../arrow-glib/.libs \ - --library-path=$(abs_builddir)/.libs -else -ArrowCUDA_1_0_gir_LIBS += \ - $(abs_builddir)/../arrow-glib/libarrow-glib.la \ - libarrow-cuda-glib.la -endif - -INTROSPECTION_GIRS += ArrowCUDA-1.0.gir - -girdir = $(datadir)/gir-1.0 -gir_DATA = $(INTROSPECTION_GIRS) - -typelibdir = $(libdir)/girepository-1.0 -typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) - -CLEANFILES += \ - $(gir_DATA) \ - $(typelib_DATA) -endif -endif diff --git a/c_glib/arrow-cuda-glib/arrow-cuda-glib.pc.in b/c_glib/arrow-cuda-glib/arrow-cuda-glib.pc.in deleted file mode 100644 index de0ce974c7a..00000000000 --- a/c_glib/arrow-cuda-glib/arrow-cuda-glib.pc.in +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: Apache Arrow CUDA GLib -Description: C API for Apache Arrow CUDA based on GLib -Version: @VERSION@ -Libs: -L${libdir} -larrow-cuda-glib -Cflags: -I${includedir} -Requires: arrow-glib arrow-cuda diff --git a/c_glib/arrow-dataset-glib/Makefile.am b/c_glib/arrow-dataset-glib/Makefile.am deleted file mode 100644 index 81e5aa5f083..00000000000 --- a/c_glib/arrow-dataset-glib/Makefile.am +++ /dev/null @@ -1,136 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -CLEANFILES = - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -if HAVE_ARROW_DATASET -lib_LTLIBRARIES = \ - libarrow-dataset-glib.la - -libarrow_dataset_glib_la_CXXFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_DATASET_CFLAGS) \ - $(GARROW_CXXFLAGS) - -libarrow_dataset_glib_la_LDFLAGS = \ - -version-info $(LT_VERSION_INFO) \ - -no-undefined - -libarrow_dataset_glib_la_LIBADD = \ - $(GLIB_LIBS) \ - ../arrow-glib/libarrow-glib.la \ - $(ARROW_DATASET_LIBS) - -libarrow_dataset_glib_la_headers = \ - arrow-dataset-glib.h \ - file-format.h \ - fragment.h \ - scanner.h - -libarrow_dataset_glib_la_sources = \ - file-format.cpp \ - fragment.cpp \ - scanner.cpp \ - $(libarrow_dataset_glib_la_headers) - -libarrow_dataset_glib_la_cpp_headers = \ - arrow-dataset-glib.hpp \ - file-format.hpp \ - fragment.hpp \ - scanner.hpp - -libarrow_dataset_glib_la_SOURCES = \ - $(libarrow_dataset_glib_la_sources) \ - $(libarrow_dataset_glib_la_cpp_headers) - -arrow_dataset_glib_includedir = \ - $(includedir)/arrow-dataset-glib -arrow_dataset_glib_include_HEADERS = \ - $(libarrow_dataset_glib_la_headers) \ - $(libarrow_dataset_glib_la_cpp_headers) - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = \ - arrow-dataset-glib.pc - -if HAVE_INTROSPECTION --include $(INTROSPECTION_MAKEFILE) -INTROSPECTION_GIRS = -INTROSPECTION_SCANNER_ARGS = -INTROSPECTION_SCANNER_ENV = -if USE_ARROW_BUILD_DIR -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_builddir}/../arrow-glib:$(ARROW_BUILD_DIR)/src/arrow:$${PKG_CONFIG_PATH} -else -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_builddir}/../arrow-glib:$${PKG_CONFIG_PATH} -endif -INTROSPECTION_COMPILER_ARGS = \ - --includedir=$(abs_builddir)/../arrow-glib - -ArrowDataset-1.0.gir: libarrow-dataset-glib.la -ArrowDataset_1_0_gir_PACKAGES = \ 
- arrow-glib -ArrowDataset_1_0_gir_EXPORT_PACKAGES = \ - arrow-dataset-glib -ArrowDataset_1_0_gir_INCLUDES = \ - Arrow-1.0 -ArrowDataset_1_0_gir_CFLAGS = \ - $(AM_CPPFLAGS) -ArrowDataset_1_0_gir_LIBS = -ArrowDataset_1_0_gir_FILES = \ - $(libarrow_dataset_glib_la_sources) -ArrowDataset_1_0_gir_SCANNERFLAGS = \ - --add-include-path=$(abs_builddir)/../arrow-glib \ - --identifier-prefix=GAD \ - --library-path=$(ARROW_LIB_DIR) \ - --symbol-prefix=gad \ - --warn-all -if OS_MACOS -ArrowDataset_1_0_gir_LIBS += \ - arrow-glib \ - arrow-dataset-glib -ArrowDataset_1_0_gir_SCANNERFLAGS += \ - --no-libtool \ - --library-path=$(abs_builddir)/../arrow-glib/.libs \ - --library-path=$(abs_builddir)/.libs -else -ArrowDataset_1_0_gir_LIBS += \ - $(abs_builddir)/../arrow-glib/libarrow-glib.la \ - libarrow-dataset-glib.la -endif - -INTROSPECTION_GIRS += ArrowDataset-1.0.gir - -girdir = $(datadir)/gir-1.0 -gir_DATA = $(INTROSPECTION_GIRS) - -typelibdir = $(libdir)/girepository-1.0 -typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) - -CLEANFILES += \ - $(gir_DATA) \ - $(typelib_DATA) -endif -endif diff --git a/c_glib/arrow-glib/Makefile.am b/c_glib/arrow-glib/Makefile.am deleted file mode 100644 index 9f19578d537..00000000000 --- a/c_glib/arrow-glib/Makefile.am +++ /dev/null @@ -1,309 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -CLEANFILES = -DISTCLEANFILES = - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(GARROW_CFLAGS) - -# libarrow-glib -lib_LTLIBRARIES = \ - libarrow-glib.la - -libarrow_glib_la_CXXFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CFLAGS) \ - $(GARROW_CXXFLAGS) - -libarrow_glib_la_LDFLAGS = \ - -version-info $(LT_VERSION_INFO) \ - -no-undefined - -libarrow_glib_la_LIBADD = \ - $(GLIB_LIBS) \ - $(ARROW_LIBS) - -libarrow_glib_la_headers = \ - array.h \ - array-builder.h \ - arrow-glib.h \ - basic-array.h \ - basic-data-type.h \ - buffer.h \ - chunked-array.h \ - codec.h \ - composite-array.h \ - composite-data-type.h \ - data-type.h \ - datum.h \ - decimal.h \ - error.h \ - field.h \ - gobject-type.h \ - record-batch.h \ - schema.h \ - table.h \ - table-builder.h \ - tensor.h \ - type.h - -libarrow_glib_la_headers += \ - file.h \ - file-mode.h \ - input-stream.h \ - output-stream.h \ - readable.h \ - writable.h \ - writable-file.h - -libarrow_glib_la_headers += \ - ipc-options.h \ - metadata-version.h \ - reader.h \ - writer.h - -libarrow_glib_la_headers += \ - compute.h - -libarrow_glib_la_headers += \ - file-system.h \ - local-file-system.h - -if HAVE_ARROW_ORC -libarrow_glib_la_headers += \ - orc-file-reader.h -endif - -libarrow_glib_la_generated_headers = \ - enums.h \ - version.h - -libarrow_glib_la_generated_sources = \ - enums.c \ - $(libarrow_glib_la_generated_headers) - -libarrow_glib_la_sources = \ - array-builder.cpp \ - basic-array.cpp \ - basic-data-type.cpp \ - buffer.cpp \ - chunked-array.cpp \ - codec.cpp \ - composite-array.cpp \ - composite-data-type.cpp \ - datum.cpp \ - decimal.cpp \ - error.cpp \ - field.cpp \ - record-batch.cpp \ - schema.cpp \ - table.cpp \ - table-builder.cpp \ - tensor.cpp \ - type.cpp \ - $(libarrow_glib_la_headers) \ - $(libarrow_glib_la_generated_sources) - -libarrow_glib_la_sources += \ - file.cpp \ - file-mode.cpp \ - input-stream.cpp \ - output-stream.cpp \ - readable.cpp \ - writable.cpp \ - writable-file.cpp - -libarrow_glib_la_sources += \ - ipc-options.cpp \ - metadata-version.cpp \ - reader.cpp \ - writer.cpp - -libarrow_glib_la_sources += \ - compute.cpp - -libarrow_glib_la_sources += \ - file-system.cpp \ - local-file-system.cpp - -if HAVE_ARROW_ORC -libarrow_glib_la_sources += \ - orc-file-reader.cpp -endif - -libarrow_glib_la_cpp_headers = \ - array.hpp \ - array-builder.hpp \ - arrow-glib.hpp \ - basic-array.hpp \ - basic-data-type.hpp \ - buffer.hpp \ - chunked-array.hpp \ - codec.hpp \ - data-type.hpp \ - datum.hpp \ - decimal.hpp \ - error.hpp \ - field.hpp \ - record-batch.hpp \ - schema.hpp \ - table.hpp \ - table-builder.hpp \ - tensor.hpp \ - type.hpp - -libarrow_glib_la_cpp_headers += \ - file.hpp \ - file-mode.hpp \ - input-stream.hpp \ - output-stream.hpp \ - readable.hpp \ - writable.hpp \ - writable-file.hpp - -libarrow_glib_la_cpp_headers += \ - ipc-options.hpp \ - metadata-version.hpp \ - reader.hpp \ - writer.hpp - -libarrow_glib_la_cpp_headers += \ - compute.hpp - -libarrow_glib_la_cpp_headers += \ - file-system.hpp \ - local-file-system.hpp - -if HAVE_ARROW_ORC -libarrow_glib_la_cpp_headers += \ - orc-file-reader.hpp -endif - -libarrow_glib_la_cpp_internal_headers = \ - internal-hash-table.hpp \ - internal-index.hpp - -libarrow_glib_la_SOURCES = \ - $(libarrow_glib_la_sources) \ - $(libarrow_glib_la_cpp_headers) \ - $(libarrow_glib_la_cpp_internal_headers) - -BUILT_SOURCES = \ - $(libarrow_glib_la_generated_headers) \ - 
$(libarrow_glib_la_generated_sources) \ - stamp-enums.c \ - stamp-enums.h - -DISTCLEANFILES += \ - stamp-enums.c \ - stamp-enums.h - -EXTRA_DIST += \ - enums.c.template \ - enums.h.template - -enums.h: stamp-enums.h - @true -stamp-enums.h: $(libarrow_glib_la_headers) enums.h.template - $(AM_V_GEN) \ - (cd $(srcdir) && \ - $(GLIB_MKENUMS) \ - --identifier-prefix GArrow \ - --symbol-prefix garrow \ - --template enums.h.template \ - $(libarrow_glib_la_headers)) > enums.h - touch $@ - -enums.c: stamp-enums.c - @true -stamp-enums.c: $(libarrow_glib_la_headers) enums.c.template - $(AM_V_GEN) \ - (cd $(srcdir) && \ - $(GLIB_MKENUMS) \ - --identifier-prefix GArrow \ - --symbol-prefix garrow \ - --template enums.c.template \ - $(libarrow_glib_la_headers)) > enums.c - touch $@ - -arrow_glib_includedir = $(includedir)/arrow-glib -arrow_glib_include_HEADERS = \ - $(libarrow_glib_la_headers) \ - $(libarrow_glib_la_cpp_headers) \ - $(libarrow_glib_la_generated_headers) - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = \ - arrow-glib.pc - -if HAVE_ARROW_ORC -pkgconfig_DATA += \ - arrow-orc-glib.pc -endif - -if HAVE_INTROSPECTION --include $(INTROSPECTION_MAKEFILE) -INTROSPECTION_GIRS = -INTROSPECTION_SCANNER_ARGS = -INTROSPECTION_SCANNER_ENV = -INTROSPECTION_COMPILER_ARGS = - -Arrow-1.0.gir: libarrow-glib.la -Arrow_1_0_gir_PACKAGES = \ - gio-2.0 -Arrow_1_0_gir_EXPORT_PACKAGES = \ - arrow-glib -Arrow_1_0_gir_INCLUDES = \ - Gio-2.0 -Arrow_1_0_gir_CFLAGS = \ - $(AM_CPPFLAGS) -Arrow_1_0_gir_LIBS = -Arrow_1_0_gir_FILES = $(libarrow_glib_la_sources) -Arrow_1_0_gir_SCANNERFLAGS = \ - --library-path=$(ARROW_LIB_DIR) \ - --warn-all \ - --identifier-prefix=GArrow \ - --symbol-prefix=garrow -if OS_MACOS -Arrow_1_0_gir_LIBS += arrow-glib -Arrow_1_0_gir_SCANNERFLAGS += \ - --no-libtool \ - --library-path=$(abs_builddir)/.libs -else -Arrow_1_0_gir_LIBS += libarrow-glib.la -endif -INTROSPECTION_GIRS += Arrow-1.0.gir - -girdir = $(datadir)/gir-1.0 -gir_DATA = $(INTROSPECTION_GIRS) - -typelibdir = $(libdir)/girepository-1.0 -typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) - -CLEANFILES += \ - $(gir_DATA) \ - $(typelib_DATA) -endif diff --git a/c_glib/arrow-glib/arrow-glib.pc.in b/c_glib/arrow-glib/arrow-glib.pc.in deleted file mode 100644 index f9f27b24990..00000000000 --- a/c_glib/arrow-glib/arrow-glib.pc.in +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: Apache Arrow GLib -Description: C API for Apache Arrow based on GLib -Version: @VERSION@ -Libs: -L${libdir} -larrow-glib -Cflags: -I${includedir} -Requires: gobject-2.0 arrow diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp index db6fa544069..636a3c74707 100644 --- a/c_glib/arrow-glib/reader.cpp +++ b/c_glib/arrow-glib/reader.cpp @@ -1479,10 +1479,12 @@ garrow_csv_read_options_add_column_name(GArrowCSVReadOptions *options, typedef struct GArrowCSVReaderPrivate_ { std::shared_ptr reader; + GArrowInputStream *input; } GArrowCSVReaderPrivate; enum { - PROP_CSV_TABLE_READER = 1 + PROP_CSV_TABLE_READER = 1, + PROP_CSV_READER_INPUT, }; G_DEFINE_TYPE_WITH_PRIVATE(GArrowCSVReader, @@ -1499,11 +1501,24 @@ garrow_csv_reader_dispose(GObject *object) { auto priv = GARROW_CSV_READER_GET_PRIVATE(object); - priv->reader = nullptr; + if (priv->input) { + g_object_unref(priv->input); + priv->input = nullptr; + } G_OBJECT_CLASS(garrow_csv_reader_parent_class)->dispose(object); } +static void +garrow_csv_reader_finalize(GObject *object) +{ + auto priv = GARROW_CSV_READER_GET_PRIVATE(object); + + priv->reader.~shared_ptr(); + + G_OBJECT_CLASS(garrow_csv_reader_parent_class)->finalize(object); +} + static void garrow_csv_reader_set_property(GObject *object, guint prop_id, @@ -1517,6 +1532,9 @@ garrow_csv_reader_set_property(GObject *object, priv->reader = *static_cast *>(g_value_get_pointer(value)); break; + case PROP_CSV_READER_INPUT: + priv->input = GARROW_INPUT_STREAM(g_value_dup_object(value)); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -1529,7 +1547,12 @@ garrow_csv_reader_get_property(GObject *object, GValue *value, GParamSpec *pspec) { + auto priv = GARROW_CSV_READER_GET_PRIVATE(object); + switch (prop_id) { + case PROP_CSV_READER_INPUT: + g_value_set_object(value, priv->input); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -1539,25 +1562,37 @@ garrow_csv_reader_get_property(GObject *object, static void garrow_csv_reader_init(GArrowCSVReader *object) { + auto priv = GARROW_CSV_READER_GET_PRIVATE(object); + new(&priv->reader) std::shared_ptr; } static void garrow_csv_reader_class_init(GArrowCSVReaderClass *klass) { - GParamSpec *spec; - auto gobject_class = G_OBJECT_CLASS(klass); gobject_class->dispose = garrow_csv_reader_dispose; + gobject_class->finalize = garrow_csv_reader_finalize; gobject_class->set_property = garrow_csv_reader_set_property; gobject_class->get_property = garrow_csv_reader_get_property; + GParamSpec *spec; spec = g_param_spec_pointer("csv-table-reader", "CSV table reader", "The raw std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_CSV_TABLE_READER, spec); + + spec = g_param_spec_object("input", + "Input", + "The input stream to be read", + GARROW_TYPE_INPUT_STREAM, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, + PROP_CSV_READER_INPUT, + spec); } /** @@ -1597,7 +1632,7 @@ garrow_csv_reader_new(GArrowInputStream *input, parse_options, convert_options); if (garrow::check(error, arrow_reader, "[csv-reader][new]")) { - return garrow_csv_reader_new_raw(&(arrow_reader.ValueOrDie())); + return garrow_csv_reader_new_raw(&(*arrow_reader), input); } else { return NULL; } @@ -1633,11 +1668,11 @@ typedef struct GArrowJSONReadOptionsPrivate_ { } 
GArrowJSONReadOptionsPrivate; enum { - PROP_JSON_READER_USE_THREADS = 1, - PROP_JSON_READER_BLOCK_SIZE, - PROP_JSON_READER_ALLOW_NEWLINES_IN_VALUES, - PROP_JSON_READER_UNEXPECTED_FIELD_BEHAVIOR, - PROP_JSON_READER_SCHEMA + PROP_JSON_READ_OPTIONS_USE_THREADS = 1, + PROP_JSON_READ_OPTIONS_BLOCK_SIZE, + PROP_JSON_READ_OPTIONS_ALLOW_NEWLINES_IN_VALUES, + PROP_JSON_READ_OPTIONS_UNEXPECTED_FIELD_BEHAVIOR, + PROP_JSON_READ_OPTIONS_SCHEMA, }; G_DEFINE_TYPE_WITH_PRIVATE(GArrowJSONReadOptions, @@ -1671,20 +1706,20 @@ garrow_json_read_options_set_property(GObject *object, auto priv = GARROW_JSON_READ_OPTIONS_GET_PRIVATE(object); switch (prop_id) { - case PROP_JSON_READER_USE_THREADS: + case PROP_JSON_READ_OPTIONS_USE_THREADS: priv->read_options.use_threads = g_value_get_boolean(value); break; - case PROP_JSON_READER_BLOCK_SIZE: + case PROP_JSON_READ_OPTIONS_BLOCK_SIZE: priv->read_options.block_size = g_value_get_int(value); break; - case PROP_JSON_READER_ALLOW_NEWLINES_IN_VALUES: + case PROP_JSON_READ_OPTIONS_ALLOW_NEWLINES_IN_VALUES: priv->parse_options.newlines_in_values = g_value_get_boolean(value); break; - case PROP_JSON_READER_UNEXPECTED_FIELD_BEHAVIOR: + case PROP_JSON_READ_OPTIONS_UNEXPECTED_FIELD_BEHAVIOR: priv->parse_options.unexpected_field_behavior = static_cast(g_value_get_enum(value)); break; - case PROP_JSON_READER_SCHEMA: + case PROP_JSON_READ_OPTIONS_SCHEMA: { auto schema = g_value_dup_object(value); if (priv->schema) { @@ -1714,19 +1749,19 @@ garrow_json_read_options_get_property(GObject *object, auto priv = GARROW_JSON_READ_OPTIONS_GET_PRIVATE(object); switch (prop_id) { - case PROP_JSON_READER_USE_THREADS: + case PROP_JSON_READ_OPTIONS_USE_THREADS: g_value_set_boolean(value, priv->read_options.use_threads); break; - case PROP_JSON_READER_BLOCK_SIZE: + case PROP_JSON_READ_OPTIONS_BLOCK_SIZE: g_value_set_int(value, priv->read_options.block_size); break; - case PROP_JSON_READER_ALLOW_NEWLINES_IN_VALUES: + case PROP_JSON_READ_OPTIONS_ALLOW_NEWLINES_IN_VALUES: g_value_set_boolean(value, priv->parse_options.newlines_in_values); break; - case PROP_JSON_READER_UNEXPECTED_FIELD_BEHAVIOR: + case PROP_JSON_READ_OPTIONS_UNEXPECTED_FIELD_BEHAVIOR: g_value_set_enum(value, static_cast(priv->parse_options.unexpected_field_behavior)); break; - case PROP_JSON_READER_SCHEMA: + case PROP_JSON_READ_OPTIONS_SCHEMA: g_value_set_object(value, priv->schema); break; default: @@ -1769,7 +1804,7 @@ garrow_json_read_options_class_init(GArrowJSONReadOptionsClass *klass) read_options.use_threads, static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, - PROP_JSON_READER_USE_THREADS, + PROP_JSON_READ_OPTIONS_USE_THREADS, spec); /** @@ -1790,7 +1825,7 @@ garrow_json_read_options_class_init(GArrowJSONReadOptionsClass *klass) read_options.block_size, static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, - PROP_JSON_READER_BLOCK_SIZE, + PROP_JSON_READ_OPTIONS_BLOCK_SIZE, spec); @@ -1812,7 +1847,7 @@ garrow_json_read_options_class_init(GArrowJSONReadOptionsClass *klass) parse_options.newlines_in_values, static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, - PROP_JSON_READER_ALLOW_NEWLINES_IN_VALUES, + PROP_JSON_READ_OPTIONS_ALLOW_NEWLINES_IN_VALUES, spec); /** @@ -1829,7 +1864,7 @@ garrow_json_read_options_class_init(GArrowJSONReadOptionsClass *klass) GARROW_JSON_READ_INFER_TYPE, static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, - PROP_JSON_READER_UNEXPECTED_FIELD_BEHAVIOR, + 
PROP_JSON_READ_OPTIONS_UNEXPECTED_FIELD_BEHAVIOR, spec); /** @@ -1845,7 +1880,7 @@ garrow_json_read_options_class_init(GArrowJSONReadOptionsClass *klass) GARROW_TYPE_SCHEMA, static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, - PROP_JSON_READER_SCHEMA, + PROP_JSON_READ_OPTIONS_SCHEMA, spec); } @@ -1866,10 +1901,12 @@ garrow_json_read_options_new(void) typedef struct GArrowJSONReaderPrivate_ { std::shared_ptr reader; + GArrowInputStream *input; } GArrowJSONReaderPrivate; enum { - PROP_JSON_TABLE_READER = 1 + PROP_JSON_TABLE_READER = 1, + PROP_JSON_READER_INPUT, }; G_DEFINE_TYPE_WITH_PRIVATE(GArrowJSONReader, @@ -1886,11 +1923,24 @@ garrow_json_reader_dispose(GObject *object) { auto priv = GARROW_JSON_READER_GET_PRIVATE(object); - priv->reader = nullptr; + if (priv->input) { + g_object_unref(priv->input); + priv->input = nullptr; + } G_OBJECT_CLASS(garrow_json_reader_parent_class)->dispose(object); } +static void +garrow_json_reader_finalize(GObject *object) +{ + auto priv = GARROW_JSON_READER_GET_PRIVATE(object); + + priv->reader.~shared_ptr(); + + G_OBJECT_CLASS(garrow_json_reader_parent_class)->finalize(object); +} + static void garrow_json_reader_set_property(GObject *object, guint prop_id, @@ -1904,6 +1954,9 @@ garrow_json_reader_set_property(GObject *object, priv->reader = *static_cast *>(g_value_get_pointer(value)); break; + case PROP_JSON_READER_INPUT: + priv->input = GARROW_INPUT_STREAM(g_value_dup_object(value)); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -1916,7 +1969,12 @@ garrow_json_reader_get_property(GObject *object, GValue *value, GParamSpec *pspec) { + auto priv = GARROW_JSON_READER_GET_PRIVATE(object); + switch (prop_id) { + case PROP_JSON_READER_INPUT: + g_value_set_object(value, priv->input); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -1926,25 +1984,37 @@ garrow_json_reader_get_property(GObject *object, static void garrow_json_reader_init(GArrowJSONReader *object) { + auto priv = GARROW_JSON_READER_GET_PRIVATE(object); + new(&priv->reader) std::shared_ptr; } static void garrow_json_reader_class_init(GArrowJSONReaderClass *klass) { - GParamSpec *spec; - auto gobject_class = G_OBJECT_CLASS(klass); gobject_class->dispose = garrow_json_reader_dispose; + gobject_class->finalize = garrow_json_reader_finalize; gobject_class->set_property = garrow_json_reader_set_property; gobject_class->get_property = garrow_json_reader_get_property; + GParamSpec *spec; spec = g_param_spec_pointer("json-table-reader", "JSON table reader", "The raw std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_JSON_TABLE_READER, spec); + + spec = g_param_spec_object("input", + "Input", + "The input stream to be read", + GARROW_TYPE_INPUT_STREAM, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, + PROP_JSON_READER_INPUT, + spec); } /** @@ -1980,7 +2050,7 @@ garrow_json_reader_new(GArrowInputStream *input, } if (garrow::check(error, arrow_reader, "[json-reader][new]")) { - return garrow_json_reader_new_raw(&(arrow_reader.ValueOrDie())); + return garrow_json_reader_new_raw(&*arrow_reader, input); } else { return NULL; } @@ -2085,10 +2155,12 @@ garrow_feather_file_reader_get_raw(GArrowFeatherFileReader *reader) } GArrowCSVReader * -garrow_csv_reader_new_raw(std::shared_ptr *arrow_reader) +garrow_csv_reader_new_raw(std::shared_ptr *arrow_reader, + GArrowInputStream 
*input) { auto reader = GARROW_CSV_READER(g_object_new(GARROW_TYPE_CSV_READER, "csv-table-reader", arrow_reader, + "input", input, NULL)); return reader; } @@ -2101,10 +2173,12 @@ garrow_csv_reader_get_raw(GArrowCSVReader *reader) } GArrowJSONReader * -garrow_json_reader_new_raw(std::shared_ptr *arrow_reader) +garrow_json_reader_new_raw(std::shared_ptr *arrow_reader, + GArrowInputStream *input) { auto reader = GARROW_JSON_READER(g_object_new(GARROW_TYPE_JSON_READER, "json-table-reader", arrow_reader, + "input", input, NULL)); return reader; } diff --git a/c_glib/arrow-glib/reader.hpp b/c_glib/arrow-glib/reader.hpp index c1df700fe13..c7b2b76f215 100644 --- a/c_glib/arrow-glib/reader.hpp +++ b/c_glib/arrow-glib/reader.hpp @@ -44,11 +44,13 @@ std::shared_ptr garrow_feather_file_reader_get_raw(GArrowFeatherFileReader *reader); GArrowCSVReader * -garrow_csv_reader_new_raw(std::shared_ptr *arrow_reader); +garrow_csv_reader_new_raw(std::shared_ptr *arrow_reader, + GArrowInputStream *input); std::shared_ptr garrow_csv_reader_get_raw(GArrowCSVReader *reader); GArrowJSONReader * -garrow_json_reader_new_raw(std::shared_ptr *arrow_reader); +garrow_json_reader_new_raw(std::shared_ptr *arrow_reader, + GArrowInputStream *input); std::shared_ptr garrow_json_reader_get_raw(GArrowJSONReader *reader); diff --git a/c_glib/configure.ac b/c_glib/configure.ac deleted file mode 100644 index 58c75b45002..00000000000 --- a/c_glib/configure.ac +++ /dev/null @@ -1,346 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -AC_PREREQ(2.65) - -m4_define([arrow_glib_version], 4.0.0-SNAPSHOT) -AC_INIT([arrow-glib], - arrow_glib_version, - [https://issues.apache.org/jira/browse/ARROW], - [apache-arrow-glib]) -AC_CONFIG_AUX_DIR([config]) -AC_CONFIG_MACRO_DIR([m4]) - -AC_CONFIG_SRCDIR([arrow-glib/arrow-glib.h]) -AC_CONFIG_HEADERS([config.h]) - -AM_INIT_AUTOMAKE([1.13 foreign]) -AM_SILENT_RULES([yes]) - -GARROW_VERSION_MAJOR=$(echo "arrow_glib_version" | \ - sed -E -e 's/^([[0-9]]+)\..+$/\1/' | \ - tr -d '\n') -GARROW_VERSION_MINOR=$(echo "arrow_glib_version" | \ - sed -E -e 's/^[[0-9]]+\.([[0-9]]+)\..+$/\1/' | \ - tr -d '\n') -GARROW_VERSION_MICRO=$(echo "arrow_glib_version" | \ - sed -E -e 's/^[[0-9]]+\.[[0-9]]+\.([[0-9]]+).*$/\1/' | \ - tr -d '\n') -if echo "arrow_glib_version" | grep -- "-" > /dev/null; then - GARROW_VERSION_TAG=$(echo "arrow_glib_version" | \ - sed -E -e 's/^[[0-9]]+\.[[0-9]]+\.[[0-9]]+-(.+)$/\1/' | \ - tr -d '\n') -else - GARROW_VERSION_TAG= -fi -AC_SUBST(GARROW_VERSION_MAJOR) -AC_SUBST(GARROW_VERSION_MINOR) -AC_SUBST(GARROW_VERSION_MICRO) -AC_SUBST(GARROW_VERSION_TAG) - -GGANDIVA_VERSION_MAJOR=${GARROW_VERSION_MAJOR} -GGANDIVA_VERSION_MINOR=${GARROW_VERSION_MINOR} -GGANDIVA_VERSION_MICRO=${GARROW_VERSION_MICRO} -GGANDIVA_VERSION_TAG=${GARROW_VERSION_TAG} -AC_SUBST(GGANDIVA_VERSION_MAJOR) -AC_SUBST(GGANDIVA_VERSION_MINOR) -AC_SUBST(GGANDIVA_VERSION_MICRO) -AC_SUBST(GGANDIVA_VERSION_TAG) - -GPARQUET_VERSION_MAJOR=${GARROW_VERSION_MAJOR} -GPARQUET_VERSION_MINOR=${GARROW_VERSION_MINOR} -GPARQUET_VERSION_MICRO=${GARROW_VERSION_MICRO} -GPARQUET_VERSION_TAG=${GARROW_VERSION_TAG} -AC_SUBST(GPARQUET_VERSION_MAJOR) -AC_SUBST(GPARQUET_VERSION_MINOR) -AC_SUBST(GPARQUET_VERSION_MICRO) -AC_SUBST(GPARQUET_VERSION_TAG) - -AC_CANONICAL_HOST -AC_MSG_CHECKING([for macOS]) -case "$host_os" in -darwin*) - os_macos=yes - ;; -*) - os_macos=no - ;; -esac -AC_MSG_RESULT([$os_macos]) -AM_CONDITIONAL(OS_MACOS, test "$os_macos" = "yes") - -LT_INIT -LT_CURRENT=$(expr ${GARROW_VERSION_MAJOR} \* 100 + ${GARROW_VERSION_MINOR}) -LT_REVISION=${GARROW_VERSION_MICRO} -LT_AGE=0 -LT_VERSION_INFO="\$(LT_CURRENT):\$(LT_REVISION):\$(LT_AGE)" -AC_SUBST(LT_CURRENT) -AC_SUBST(LT_REVISION) -AC_SUBST(LT_AGE) -AC_SUBST(LT_VERSION_INFO) - -AC_PROG_CC -AC_PROG_CXX -AX_CXX_COMPILE_STDCXX_11([ext], [mandatory]) - -GARROW_CFLAGS="-Wall" -GARROW_CXXFLAGS="-Wall" -AC_ARG_ENABLE(debug, - [AS_HELP_STRING([--enable-debug], - [Use debug flags (default=no)])], - [GARROW_DEBUG="$enableval"], - [GARROW_DEBUG="no"]) -if test "x$GARROW_DEBUG" != "xno"; then - GARROW_DEBUG="yes" - if test "$CLANG" = "yes"; then - CFLAGS="$CFLAGS -O0 -g" - CXXFLAGS="$CXXFLAGS -O0 -g" - elif test "$GCC" = "yes"; then - CFLAGS="$CFLAGS -O0 -g3" - CXXFLAGS="$CXXFLAGS -O0 -g3" - fi -fi -AC_ARG_ENABLE(development-mode, - [AS_HELP_STRING([--enable-development-mode], - [Use development mode (default=no)])], - [GARROW_DEVELOPMENT_MODE="$enableval"], - [GARROW_DEVELOPMENT_MODE="no"]) -if test "x$GARROW_DEVELOPMENT_MODE" != "xno"; then - if test "$CLANG" = "yes" -o "$GCC" = "yes"; then - CFLAGS="$CFLAGS -Werror" - CXXFLAGS="$CXXFLAGS -Werror" - fi -fi -AC_SUBST(GARROW_CFLAGS) -AC_SUBST(GARROW_CXXFLAGS) - -AM_PATH_GLIB_2_0([2.32.4], - [], - [AC_MSG_ERROR(GLib isn't available)], - [gobject gio]) - -GOBJECT_INTROSPECTION_CHECK([1.32.1]) -GTK_DOC_CHECK([1.18-2]) - -AC_ARG_WITH(arrow-cpp-build-type, - [AS_HELP_STRING([--with-arrow-cpp-build-type=TYPE], - [-DCMAKE_BUILD_TYPE option value for Arrow C++ (default=release)])], - [GARROW_ARROW_CPP_BUILD_TYPE="$withval"], - 
[GARROW_ARROW_CPP_BUILD_TYPE="release"]) - -ARROW_CUDA_PKG_CONFIG_PATH="" -AC_ARG_WITH(arrow-cpp-build-dir, - [AS_HELP_STRING([--with-arrow-cpp-build-dir=PATH], - [Use this option to build with not installed Arrow C++])], - [GARROW_ARROW_CPP_BUILD_DIR="$withval"], - [GARROW_ARROW_CPP_BUILD_DIR=""]) -if test "x$GARROW_ARROW_CPP_BUILD_DIR" = "x"; then - USE_ARROW_BUILD_DIR=no - - arrow_packages="arrow" - arrow_packages="${arrow_packages} arrow-compute" - arrow_packages="${arrow_packages} arrow-csv" - arrow_packages="${arrow_packages} arrow-filesystem" - arrow_packages="${arrow_packages} arrow-json" - PKG_CHECK_MODULES([ARROW], [${arrow_packages}]) - _PKG_CONFIG(ARROW_LIB_DIR, [variable=libdir], [arrow]) - ARROW_LIB_DIR="$pkg_cv_ARROW_LIB_DIR" - PKG_CHECK_MODULES([ARROW_ORC], - [arrow-orc], - [HAVE_ARROW_ORC=yes], - [HAVE_ARROW_ORC=no]) - PKG_CHECK_MODULES([ARROW_CUDA], - [arrow-cuda], - [HAVE_ARROW_CUDA=yes], - [HAVE_ARROW_CUDA=no]) - PKG_CHECK_MODULES([ARROW_DATASET], - [arrow-dataset], - [HAVE_ARROW_DATASET=yes], - [HAVE_ARROW_DATASET=no]) - PKG_CHECK_MODULES([GANDIVA], - [gandiva], - [HAVE_GANDIVA=yes], - [HAVE_GANDIVA=no]) - PKG_CHECK_MODULES([PARQUET], - [parquet], - [HAVE_PARQUET=yes], - [HAVE_PARQUET=no]) - PKG_CHECK_MODULES([PLASMA], - [plasma], - [HAVE_PLASMA=yes], - [HAVE_PLASMA=no]) -else - USE_ARROW_BUILD_DIR=yes - - ARROW_BUILD_DIR="${GARROW_ARROW_CPP_BUILD_DIR}" - AC_SUBST(ARROW_BUILD_DIR) - - ARROW_SOURCE_INCLUDE_DIR="\$(abs_top_srcdir)/../cpp/src" - ARROW_BUILD_INCLUDE_DIR="${GARROW_ARROW_CPP_BUILD_DIR}/src" - ARROW_LIB_DIR="${GARROW_ARROW_CPP_BUILD_DIR}/${GARROW_ARROW_CPP_BUILD_TYPE}" - AC_SUBST(ARROW_LIB_DIR) - - ARROW_CFLAGS="-I${ARROW_BUILD_INCLUDE_DIR} -I${ARROW_SOURCE_INCLUDE_DIR}" - ARROW_LIBS="-L\$(ARROW_LIB_DIR) -larrow" - AC_SUBST(ARROW_CFLAGS) - AC_SUBST(ARROW_LIBS) - - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/arrow/adapters/orc/arrow-orc.pc"; then - HAVE_ARROW_ORC=yes - else - HAVE_ARROW_ORC=no - fi - - ARROW_CUDA_CFLAGS="\$(ARROW_CFLAGS)" - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/arrow/gpu/arrow-cuda.pc"; then - HAVE_ARROW_CUDA=yes - ARROW_CUDA_LIBS="-L\$(ARROW_LIB_DIR) -larrow_cuda -larrow" - ARROW_CUDA_PKG_CONFIG_PATH="\$(ARROW_BUILD_DIR)/src/arrow/gpu" - else - HAVE_ARROW_CUDA=no - ARROW_CUDA_LIBS="" - ARROW_CUDA_PKG_CONFIG_PATH="" - fi - AC_SUBST(ARROW_CUDA_CFLAGS) - AC_SUBST(ARROW_CUDA_LIBS) - AC_SUBST(ARROW_CUDA_PKG_CONFIG_PATH) - - ARROW_DATASET_CFLAGS="\$(ARROW_CFLAGS)" - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/arrow/dataset/arrow-dataset.pc"; then - HAVE_ARROW_DATASET=yes - ARROW_DATASET_LIBS="-L\$(ARROW_LIB_DIR) -larrow_dataset -lparquet -larrow" - else - HAVE_ARROW_DATASET=no - ARROW_DATASET_LIBS="" - fi - AC_SUBST(ARROW_DATASET_CFLAGS) - AC_SUBST(ARROW_DATASET_LIBS) - - GANDIVA_CFLAGS="\$(ARROW_CFLAGS)" - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/gandiva/gandiva.pc"; then - HAVE_GANDIVA=yes - GANDIVA_LIBS="-L\$(ARROW_LIB_DIR) -lgandiva -larrow" - else - HAVE_GANDIVA=no - GANDIVA_LIBS="" - fi - AC_SUBST(GANDIVA_CFLAGS) - AC_SUBST(GANDIVA_LIBS) - - PARQUET_CFLAGS="\$(ARROW_CFLAGS)" - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/parquet/parquet.pc"; then - HAVE_PARQUET=yes - PARQUET_LIBS="-L\$(ARROW_LIB_DIR) -lparquet -larrow" - else - HAVE_PARQUET=no - PARQUET_LIBS="" - fi - AC_SUBST(PARQUET_CFLAGS) - AC_SUBST(PARQUET_LIBS) - - PLASMA_CFLAGS="\$(ARROW_CFLAGS)" - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/plasma/plasma.pc"; then - HAVE_PLASMA=yes - PLASMA_LIBS="-L\$(ARROW_LIB_DIR) -lplasma -larrow" - else - HAVE_PLASMA=no - 
PLASMA_LIBS="" - fi - AC_SUBST(PLASMA_CFLAGS) - AC_SUBST(PLASMA_LIBS) -fi - -AM_CONDITIONAL([USE_ARROW_BUILD_DIR], - [test "$USE_ARROW_BUILD_DIR" = "yes"]) - -AM_CONDITIONAL([HAVE_ARROW_ORC], [test "$HAVE_ARROW_ORC" = "yes"]) -if test "$HAVE_ARROW_ORC" = "yes"; then - AC_DEFINE(HAVE_ARROW_ORC, [1], [Define to 1 if Apache Arrow supports ORC.]) -fi - -AM_CONDITIONAL([HAVE_ARROW_CUDA], [test "$HAVE_ARROW_CUDA" = "yes"]) -if test "$HAVE_ARROW_CUDA" = "yes"; then - ARROW_CUDA_GLIB_PACKAGE="arrow-cuda-glib" - PLASMA_ARROW_CUDA_PKG_CONFIG_PATH=":\$(abs_top_builddir)/arrow-cuda-glib" - if test -n "${ARROW_CUDA_PKG_CONFIG_PATH}"; then - PLASMA_ARROW_CUDA_PKG_CONFIG_PATH=":${ARROW_CUDA_PKG_CONFIG_PATH}${PLASMA_ARROW_CUDA_PKG_CONFIG_PATH}" - fi - AC_DEFINE(HAVE_ARROW_CUDA, [1], [Define to 1 if Apache Arrow supports CUDA.]) -else - ARROW_CUDA_GLIB_PACKAGE="" - PLASMA_ARROW_CUDA_PKG_CONFIG_PATH="" -fi -AC_SUBST(ARROW_CUDA_GLIB_PACKAGE) -AC_SUBST(PLASMA_ARROW_CUDA_PKG_CONFIG_PATH) - -AM_CONDITIONAL([HAVE_ARROW_DATASET], [test "$HAVE_ARROW_DATASET" = "yes"]) -if test "$HAVE_ARROW_DATASET" = "yes"; then - AC_DEFINE(HAVE_ARROW_DATASET, [1], [Define to 1 if Apache Arrow Dataset exists.]) -fi - -AM_CONDITIONAL([HAVE_GANDIVA], [test "$HAVE_GANDIVA" = "yes"]) -if test "$HAVE_GANDIVA" = "yes"; then - AC_DEFINE(HAVE_GANDIVA, [1], [Define to 1 if Gandiva exists.]) -fi - -AM_CONDITIONAL([HAVE_PARQUET], [test "$HAVE_PARQUET" = "yes"]) -if test "$HAVE_PARQUET" = "yes"; then - AC_DEFINE(HAVE_PARQUET, [1], [Define to 1 if Apache Parquet exists.]) -fi - -AM_CONDITIONAL([HAVE_PLASMA], [test "$HAVE_PLASMA" = "yes"]) -if test "$HAVE_PLASMA" = "yes"; then - AC_DEFINE(HAVE_PLASMA, [1], [Define to 1 if Plasma exists.]) -fi - -exampledir="\$(datadir)/arrow-glib/example" -AC_SUBST(exampledir) - -AC_CONFIG_FILES([ - Makefile - arrow-cuda-glib/Makefile - arrow-cuda-glib/arrow-cuda-glib.pc - arrow-dataset-glib/Makefile - arrow-dataset-glib/arrow-dataset-glib.pc - arrow-glib/Makefile - arrow-glib/arrow-glib.pc - arrow-glib/arrow-orc-glib.pc - arrow-glib/version.h - gandiva-glib/Makefile - gandiva-glib/gandiva-glib.pc - gandiva-glib/version.h - parquet-glib/Makefile - parquet-glib/parquet-glib.pc - parquet-glib/version.h - plasma-glib/Makefile - plasma-glib/plasma-glib.pc - doc/Makefile - doc/arrow-dataset-glib/Makefile - doc/arrow-dataset-glib/entities.xml - doc/arrow-glib/Makefile - doc/arrow-glib/entities.xml - doc/gandiva-glib/Makefile - doc/gandiva-glib/entities.xml - doc/parquet-glib/Makefile - doc/parquet-glib/entities.xml - doc/plasma-glib/Makefile - doc/plasma-glib/entities.xml - example/Makefile - example/lua/Makefile -]) - -AC_OUTPUT diff --git a/c_glib/doc/Makefile.am b/c_glib/doc/Makefile.am deleted file mode 100644 index a56e0415f3d..00000000000 --- a/c_glib/doc/Makefile.am +++ /dev/null @@ -1,23 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -SUBDIRS = \ - arrow-glib \ - arrow-dataset-glib \ - gandiva-glib \ - parquet-glib \ - plasma-glib diff --git a/c_glib/doc/arrow-dataset-glib/Makefile.am b/c_glib/doc/arrow-dataset-glib/Makefile.am deleted file mode 100644 index d1c636143ff..00000000000 --- a/c_glib/doc/arrow-dataset-glib/Makefile.am +++ /dev/null @@ -1,69 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -if HAVE_ARROW_DATASET -DOC_MODULE = arrow-dataset-glib - -DOC_MAIN_SGML_FILE = $(DOC_MODULE)-docs.xml - -DOC_SOURCE_DIR = \ - $(top_srcdir)/arrow-dataset-glib \ - $(top_builddir)/arrow-dataset-glib - -SCAN_OPTIONS = \ - --deprecated-guards="GARROW_DISABLE_DEPRECATED" - -MKDB_OPTIONS = \ - --name-space=gad \ - --source-suffixes="c,cpp,h" - -HFILE_GLOB = \ - $(top_srcdir)/arrow-dataset-glib/*.h - -IGNORE_HFILES = - -CFILE_GLOB = \ - $(top_srcdir)/arrow-dataset-glib/*.cpp - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CFLAGS) \ - $(ARROW_DATASET_CFLAGS) - -GTKDOC_LIBS = \ - $(top_builddir)/arrow-glib/libarrow-glib.la \ - $(top_builddir)/arrow-dataset-glib/libarrow-dataset-glib.la - -include $(top_srcdir)/gtk-doc.make - -CLEANFILES += \ - $(DOC_MODULE)-decl-list.txt \ - $(DOC_MODULE)-decl.txt \ - $(DOC_MODULE)-overrides.txt \ - $(DOC_MODULE)-sections.txt \ - $(DOC_MODULE).types -else -EXTRA_DIST = -endif - -EXTRA_DIST += \ - entities.xml.in \ - meson.build diff --git a/c_glib/doc/arrow-glib/Makefile.am b/c_glib/doc/arrow-glib/Makefile.am deleted file mode 100644 index db9f00f39f3..00000000000 --- a/c_glib/doc/arrow-glib/Makefile.am +++ /dev/null @@ -1,80 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -DOC_MODULE = arrow-glib - -DOC_MAIN_SGML_FILE = $(DOC_MODULE)-docs.xml - -DOC_SOURCE_DIR = \ - $(top_srcdir)/arrow-glib \ - $(top_builddir)/arrow-glib - -SCAN_OPTIONS = \ - --deprecated-guards="GARROW_DISABLE_DEPRECATED" - -MKDB_OPTIONS = \ - --name-space=garrow \ - --source-suffixes="c,cpp,h" - -HFILE_GLOB = \ - $(top_srcdir)/arrow-glib/*.h \ - $(top_builddir)/arrow-glib/*.h - -IGNORE_HFILES = - -if !HAVE_ARROW_ORC -IGNORE_HFILES += \ - $(top_srcdir)/arrow-glib/orc-file-reader.h -endif - -CFILE_GLOB = \ - $(top_srcdir)/arrow-glib/*.cpp - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CFLAGS) - -GTKDOC_LIBS = \ - $(top_builddir)/arrow-glib/libarrow-glib.la - -if HAVE_ARROW_CUDA -DOC_SOURCE_DIR += \ - $(top_srcdir)/arrow-cuda-glib -HFILE_GLOB += \ - $(top_srcdir)/arrow-cuda-glib/*.h -CFILE_GLOB += \ - $(top_srcdir)/arrow-cuda-glib/*.cpp -GTKDOC_LIBS += \ - $(top_builddir)/arrow-cuda-glib/libarrow-cuda-glib.la -endif - -include $(top_srcdir)/gtk-doc.make - -CLEANFILES += \ - $(DOC_MODULE)-decl-list.txt \ - $(DOC_MODULE)-decl.txt \ - $(DOC_MODULE)-overrides.txt \ - $(DOC_MODULE)-sections.txt \ - $(DOC_MODULE).types - -EXTRA_DIST += \ - entities.xml.in \ - meson.build diff --git a/c_glib/doc/gandiva-glib/Makefile.am b/c_glib/doc/gandiva-glib/Makefile.am deleted file mode 100644 index 16d333d0ae3..00000000000 --- a/c_glib/doc/gandiva-glib/Makefile.am +++ /dev/null @@ -1,69 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -if HAVE_GANDIVA -DOC_MODULE = gandiva-glib - -DOC_MAIN_SGML_FILE = $(DOC_MODULE)-docs.xml - -DOC_SOURCE_DIR = \ - $(top_srcdir)/gandiva-glib \ - $(top_builddir)/gandiva-glib - -SCAN_OPTIONS = \ - --deprecated-guards="GGANDIVA_DISABLE_DEPRECATED" - -MKDB_OPTIONS = \ - --name-space=ggandiva \ - --source-suffixes="c,cpp,h" - -HFILE_GLOB = \ - $(top_srcdir)/gandiva-glib/*.h - -IGNORE_HFILES = - -CFILE_GLOB = \ - $(top_srcdir)/gandiva-glib/*.cpp - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CFLAGS) \ - $(GANDIVA_CFLAGS) - -GTKDOC_LIBS = \ - $(top_builddir)/arrow-glib/libarrow-glib.la \ - $(top_builddir)/gandiva-glib/libgandiva-glib.la - -include $(top_srcdir)/gtk-doc.make - -CLEANFILES += \ - $(DOC_MODULE)-decl-list.txt \ - $(DOC_MODULE)-decl.txt \ - $(DOC_MODULE)-overrides.txt \ - $(DOC_MODULE)-sections.txt \ - $(DOC_MODULE).types -else -EXTRA_DIST = -endif - -EXTRA_DIST += \ - entities.xml.in \ - meson.build diff --git a/c_glib/doc/gandiva-glib/gandiva-glib-docs.xml b/c_glib/doc/gandiva-glib/gandiva-glib-docs.xml index c90f53780aa..182bbfb527e 100644 --- a/c_glib/doc/gandiva-glib/gandiva-glib-docs.xml +++ b/c_glib/doc/gandiva-glib/gandiva-glib-docs.xml @@ -42,6 +42,14 @@ Expression + + Filter + + + + Selection vector + + Projector @@ -92,6 +100,10 @@ Index of deprecated API + + Index of new symbols in 4.0.0 + + Index of new symbols in 1.0.0 diff --git a/c_glib/doc/parquet-glib/Makefile.am b/c_glib/doc/parquet-glib/Makefile.am deleted file mode 100644 index d125be1b54c..00000000000 --- a/c_glib/doc/parquet-glib/Makefile.am +++ /dev/null @@ -1,69 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -if HAVE_PARQUET -DOC_MODULE = parquet-glib - -DOC_MAIN_SGML_FILE = $(DOC_MODULE)-docs.xml - -DOC_SOURCE_DIR = \ - $(top_srcdir)/parquet-glib \ - $(top_builddir)/parquet-glib - -SCAN_OPTIONS = \ - --deprecated-guards="GPARQUET_DISABLE_DEPRECATED" - -MKDB_OPTIONS = \ - --name-space=gparquet \ - --source-suffixes="c,cpp,h" - -HFILE_GLOB = \ - $(top_srcdir)/parquet-glib/*.h - -IGNORE_HFILES = - -CFILE_GLOB = \ - $(top_srcdir)/parquet-glib/*.cpp - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CFLAGS) \ - $(PARQUET_CFLAGS) - -GTKDOC_LIBS = \ - $(top_builddir)/parquet-glib/libparquet-glib.la \ - $(top_builddir)/arrow-glib/libarrow-glib.la - -include $(top_srcdir)/gtk-doc.make - -CLEANFILES += \ - $(DOC_MODULE)-decl-list.txt \ - $(DOC_MODULE)-decl.txt \ - $(DOC_MODULE)-overrides.txt \ - $(DOC_MODULE)-sections.txt \ - $(DOC_MODULE).types -else -EXTRA_DIST = -endif - -EXTRA_DIST += \ - entities.xml.in \ - meson.build diff --git a/c_glib/doc/plasma-glib/Makefile.am b/c_glib/doc/plasma-glib/Makefile.am deleted file mode 100644 index df872d6ca31..00000000000 --- a/c_glib/doc/plasma-glib/Makefile.am +++ /dev/null @@ -1,76 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -PLASMA_ARROW_CUDA_GTKDOC_LIBS = -if HAVE_ARROW_CUDA -PLASMA_ARROW_CUDA_GTKDOC_LIBS += \ - $(top_builddir)/arrow-cuda-glib/libarrow-cuda-glib.la -endif - -if HAVE_PLASMA -DOC_MODULE = plasma-glib - -DOC_MAIN_SGML_FILE = $(DOC_MODULE)-docs.xml - -DOC_SOURCE_DIR = \ - $(top_srcdir)/plasma-glib \ - $(top_builddir)/plasma-glib - -SCAN_OPTIONS = \ - --deprecated-guards="GPLASMA_DISABLE_DEPRECATED" - -MKDB_OPTIONS = \ - --name-space=gplasma \ - --source-suffixes="c,cpp,h" - -HFILE_GLOB = \ - $(top_srcdir)/plasma-glib/*.h - -IGNORE_HFILES = - -CFILE_GLOB = \ - $(top_srcdir)/plasma-glib/*.cpp - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(ARROW_CFLAGS) \ - $(PLASMA_CFLAGS) - -GTKDOC_LIBS = \ - $(top_builddir)/arrow-glib/libarrow-glib.la \ - $(PLASMA_ARROW_CUDA_GTKDOC_LIBS) \ - $(top_builddir)/plasma-glib/libplasma-glib.la - -include $(top_srcdir)/gtk-doc.make - -CLEANFILES += \ - $(DOC_MODULE)-decl-list.txt \ - $(DOC_MODULE)-decl.txt \ - $(DOC_MODULE)-overrides.txt \ - $(DOC_MODULE)-sections.txt \ - $(DOC_MODULE).types -else -EXTRA_DIST = -endif - -EXTRA_DIST += \ - entities.xml.in \ - meson.build diff --git a/c_glib/example/Makefile.am b/c_glib/example/Makefile.am deleted file mode 100644 index 9e460ecf8e0..00000000000 --- a/c_glib/example/Makefile.am +++ /dev/null @@ -1,64 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -SUBDIRS = \ - lua - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) \ - -DGARROW_DISABLE_DEPRECATED - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(GARROW_CFLAGS) - -AM_LDFLAGS = \ - $(GLIB_LIBS) \ - $(builddir)/../arrow-glib/libarrow-glib.la -if USE_ARROW_BUILD_DIR -AM_LDFLAGS += \ - $(ARROW_LIBS) -endif - -noinst_PROGRAMS = \ - build \ - extension-type \ - read-batch \ - read-stream - -build_SOURCES = \ - build.c - -extension_type_SOURCES = \ - extension-type.c - -read_batch_SOURCES = \ - read-batch.c - -read_stream_SOURCES = \ - read-stream.c - -dist_example_DATA = \ - README.md \ - $(build_SOURCES) \ - $(extension_type_SOURCES) \ - $(read_batch_SOURCES) \ - $(read_stream_SOURCES) diff --git a/c_glib/example/lua/Makefile.am b/c_glib/example/lua/Makefile.am deleted file mode 100644 index 84ddbc7607b..00000000000 --- a/c_glib/example/lua/Makefile.am +++ /dev/null @@ -1,27 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -EXTRA_DIST = \ - meson.build - -lua_exampledir = $(exampledir)/lua -dist_lua_example_DATA = \ - README.md \ - read-batch.lua \ - read-stream.lua \ - write-batch.lua \ - write-stream.lua diff --git a/c_glib/gandiva-glib/Makefile.am b/c_glib/gandiva-glib/Makefile.am deleted file mode 100644 index 5991abeab3a..00000000000 --- a/c_glib/gandiva-glib/Makefile.am +++ /dev/null @@ -1,196 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -CLEANFILES = -DISTCLEANFILES = - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(GARROW_CFLAGS) \ - $(GGANDIVA_CFLAGS) - -if HAVE_GANDIVA -lib_LTLIBRARIES = \ - libgandiva-glib.la - -libgandiva_glib_la_CXXFLAGS = \ - $(GLIB_CFLAGS) \ - $(GANDIVA_CFLAGS) \ - $(GARROW_CFLAGS) \ - $(GGANDIVA_CFLAGS) - -libgandiva_glib_la_LDFLAGS = \ - -version-info $(LT_VERSION_INFO) \ - -no-undefined - -libgandiva_glib_la_LIBADD = \ - $(GLIB_LIBS) \ - ../arrow-glib/libarrow-glib.la \ - $(GANDIVA_LIBS) - -libgandiva_glib_la_headers = \ - expression.h \ - function-registry.h \ - function-signature.h \ - gandiva-glib.h \ - native-function.h \ - node.h \ - projector.h - -libgandiva_glib_la_generated_headers = \ - enums.h \ - version.h - -libgandiva_glib_la_generated_sources = \ - enums.c \ - $(libgandiva_glib_la_generated_headers) - -libgandiva_glib_la_sources = \ - expression.cpp \ - function-registry.cpp \ - function-signature.cpp \ - node.cpp \ - native-function.cpp \ - projector.cpp \ - $(libgandiva_glib_la_headers) \ - $(libgandiva_glib_la_generated_sources) - -libgandiva_glib_la_cpp_headers = \ - expression.hpp \ - function-signature.hpp \ - gandiva-glib.hpp \ - native-function.hpp \ - node.hpp \ - projector.hpp - -libgandiva_glib_la_SOURCES = \ - $(libgandiva_glib_la_sources) \ - $(libgandiva_glib_la_cpp_headers) - -BUILT_SOURCES = \ - $(libgandiva_glib_la_generated_sources) \ - stamp-enums.c \ - stamp-enums.h - -DISTCLEANFILES += \ - stamp-enums.c \ - stamp-enums.h - -EXTRA_DIST += \ - enums.c.template \ - enums.h.template - -enums.h: stamp-enums.h - @true -stamp-enums.h: $(libgandiva_glib_la_headers) enums.h.template - $(AM_V_GEN) \ - (cd $(srcdir) && \ - $(GLIB_MKENUMS) \ - --identifier-prefix GGandiva \ - --symbol-prefix ggandiva \ - --template enums.h.template \ - $(libgandiva_glib_la_headers)) > enums.h - touch $@ - -enums.c: stamp-enums.c - @true -stamp-enums.c: $(libarrow_glib_la_headers) enums.c.template - $(AM_V_GEN) \ - (cd $(srcdir) && \ - $(GLIB_MKENUMS) \ - --identifier-prefix GGandiva \ - --symbol-prefix ggandiva \ - --template enums.c.template \ - $(libgandiva_glib_la_headers)) > enums.c - touch $@ - -gandiva_glib_includedir = $(includedir)/gandiva-glib -gandiva_glib_include_HEADERS = \ - $(libgandiva_glib_la_headers) \ - $(libgandiva_glib_la_cpp_headers) \ - $(libgandiva_glib_la_generated_headers) - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = \ - gandiva-glib.pc - -# GObject Introspection -if HAVE_INTROSPECTION --include $(INTROSPECTION_MAKEFILE) -INTROSPECTION_GIRS = -INTROSPECTION_SCANNER_ARGS = -INTROSPECTION_SCANNER_ENV = -if USE_ARROW_BUILD_DIR -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_top_builddir}/arrow-glib:$(ARROW_BUILD_DIR)/src/arrow:$${PKG_CONFIG_PATH} -else -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_top_builddir}/arrow-glib:$${PKG_CONFIG_PATH} -endif -INTROSPECTION_COMPILER_ARGS = \ - --includedir=$(abs_top_builddir)/arrow-glib - -Gandiva-1.0.gir: libgandiva-glib.la -Gandiva_1_0_gir_PACKAGES = \ - arrow-glib -Gandiva_1_0_gir_EXPORT_PACKAGES = \ - gandiva-glib -Gandiva_1_0_gir_INCLUDES = \ - Arrow-1.0 -Gandiva_1_0_gir_CFLAGS = \ - $(AM_CPPFLAGS) -Gandiva_1_0_gir_LIBS = -Gandiva_1_0_gir_FILES = $(libgandiva_glib_la_sources) -Gandiva_1_0_gir_SCANNERFLAGS = \ - --add-include-path=$(abs_top_builddir)/arrow-glib \ - --library-path=$(ARROW_LIB_DIR) \ - --warn-all \ - --identifier-prefix=GGandiva \ - --symbol-prefix=ggandiva -if OS_MACOS -Gandiva_1_0_gir_LIBS += \ - 
arrow-glib \ - gandiva-glib -Gandiva_1_0_gir_SCANNERFLAGS += \ - --no-libtool \ - --library-path=$(abs_top_builddir)/arrow-glib/.libs \ - --library-path=$(abs_builddir)/.libs -else -Gandiva_1_0_gir_LIBS += \ - $(abs_top_builddir)/arrow-glib/libarrow-glib.la \ - libgandiva-glib.la -endif -INTROSPECTION_GIRS += Gandiva-1.0.gir - -girdir = $(datadir)/gir-1.0 -gir_DATA = $(INTROSPECTION_GIRS) - -typelibdir = $(libdir)/girepository-1.0 -typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) - -CLEANFILES += \ - $(gir_DATA) \ - $(typelib_DATA) -endif -endif diff --git a/c_glib/gandiva-glib/expression.cpp b/c_glib/gandiva-glib/expression.cpp index d3c2f58dfdb..2ad98bfc007 100644 --- a/c_glib/gandiva-glib/expression.cpp +++ b/c_glib/gandiva-glib/expression.cpp @@ -36,6 +36,8 @@ G_BEGIN_DECLS * #GGandivaExpression is a class for an expression tree with a root node, * and a result field. * + * #GGandivaCondition is a class for an expression that returns boolean. + * * Since: 0.12.0 */ @@ -217,6 +219,40 @@ ggandiva_expression_to_string(GGandivaExpression *expression) return g_strndup(string.data(), string.size()); } + +G_DEFINE_TYPE(GGandivaCondition, + ggandiva_condition, + GGANDIVA_TYPE_EXPRESSION) + +static void +ggandiva_condition_init(GGandivaCondition *object) +{ +} + +static void +ggandiva_condition_class_init(GGandivaConditionClass *klass) +{ +} + +/** + * ggandiva_condition_new: + * @root_node: The root node for the condition. + * + * Returns: A newly created #GGandivaCondition. + * + * Since: 4.0.0 + */ +GGandivaCondition * +ggandiva_condition_new(GGandivaNode *root_node) +{ + auto gandiva_root_node = ggandiva_node_get_raw(root_node); + auto gandiva_condition = + gandiva::TreeExprBuilder::MakeCondition(gandiva_root_node); + return ggandiva_condition_new_raw(&gandiva_condition, + root_node); +} + + G_END_DECLS GGandivaExpression * @@ -238,3 +274,25 @@ ggandiva_expression_get_raw(GGandivaExpression *expression) auto priv = GGANDIVA_EXPRESSION_GET_PRIVATE(expression); return priv->expression; } + + +GGandivaCondition * +ggandiva_condition_new_raw(std::shared_ptr *gandiva_condition, + GGandivaNode *root_node) +{ + auto arrow_result_field = (*gandiva_condition)->result(); + auto result_field = garrow_field_new_raw(&arrow_result_field, nullptr); + auto condition = g_object_new(GGANDIVA_TYPE_CONDITION, + "expression", gandiva_condition, + "root-node", root_node, + "result-field", result_field, + NULL); + return GGANDIVA_CONDITION(condition); +} + +std::shared_ptr +ggandiva_condition_get_raw(GGandivaCondition *condition) +{ + return std::static_pointer_cast( + ggandiva_expression_get_raw(GGANDIVA_EXPRESSION(condition))); +} diff --git a/c_glib/gandiva-glib/expression.h b/c_glib/gandiva-glib/expression.h index f86b6c504c2..0a720d9afbd 100644 --- a/c_glib/gandiva-glib/expression.h +++ b/c_glib/gandiva-glib/expression.h @@ -37,8 +37,27 @@ struct _GGandivaExpressionClass GObjectClass parent_class; }; -GGandivaExpression *ggandiva_expression_new(GGandivaNode *root_node, - GArrowField *result_field); +GGandivaExpression * +ggandiva_expression_new(GGandivaNode *root_node, + GArrowField *result_field); gchar *ggandiva_expression_to_string(GGandivaExpression *expression); + +#define GGANDIVA_TYPE_CONDITION (ggandiva_condition_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaCondition, + ggandiva_condition, + GGANDIVA, + CONDITION, + GGandivaExpression) + +struct _GGandivaConditionClass +{ + GGandivaExpressionClass parent_class; +}; + +GGANDIVA_AVAILABLE_IN_4_0 +GGandivaCondition * 
+ggandiva_condition_new(GGandivaNode *root_node); + + G_END_DECLS diff --git a/c_glib/gandiva-glib/expression.hpp b/c_glib/gandiva-glib/expression.hpp index a0d0e64c076..45b6593937f 100644 --- a/c_glib/gandiva-glib/expression.hpp +++ b/c_glib/gandiva-glib/expression.hpp @@ -26,8 +26,14 @@ #include -GGandivaExpression -*ggandiva_expression_new_raw(std::shared_ptr *gandiva_expression, - GGandivaNode *root_node, - GArrowField *result_field); +GGandivaExpression * +ggandiva_expression_new_raw(std::shared_ptr *gandiva_expression, + GGandivaNode *root_node, + GArrowField *result_field); std::shared_ptr ggandiva_expression_get_raw(GGandivaExpression *expression); + +GGandivaCondition +*ggandiva_condition_new_raw(std::shared_ptr *gandiva_expression, + GGandivaNode *root_node); +std::shared_ptr +ggandiva_condition_get_raw(GGandivaCondition *condition); diff --git a/c_glib/gandiva-glib/filter.cpp b/c_glib/gandiva-glib/filter.cpp new file mode 100644 index 00000000000..34e04fcd30c --- /dev/null +++ b/c_glib/gandiva-glib/filter.cpp @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + +#include +#include +#include +#include + +#include +#include +#include + +G_BEGIN_DECLS + +/** + * SECTION: filter + * @title: Filter classes + * @include: gandiva-glib/gandiva-glib.h + * + * #GGandivaFilter is a class for selecting records by a specific + * condition. 
+ * + * Since: 4.0.0 + */ + +typedef struct GGandivaFilterPrivate_ { + std::shared_ptr filter; + GArrowSchema *schema; + GGandivaCondition *condition; +} GGandivaFilterPrivate; + +enum { + PROP_FILTER = 1, + PROP_SCHEMA, + PROP_CONDITION, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GGandivaFilter, + ggandiva_filter, + G_TYPE_OBJECT) + +#define GGANDIVA_FILTER_GET_PRIVATE(obj) \ + static_cast( \ + ggandiva_filter_get_instance_private( \ + GGANDIVA_FILTER(obj))) + +static void +ggandiva_filter_dispose(GObject *object) +{ + auto priv = GGANDIVA_FILTER_GET_PRIVATE(object); + + if (priv->schema) { + g_object_unref(priv->schema); + priv->schema = nullptr; + } + + if (priv->condition) { + g_object_unref(priv->condition); + priv->condition = nullptr; + } + + G_OBJECT_CLASS(ggandiva_filter_parent_class)->dispose(object); +} + +static void +ggandiva_filter_finalize(GObject *object) +{ + auto priv = GGANDIVA_FILTER_GET_PRIVATE(object); + + priv->filter.~shared_ptr(); + + G_OBJECT_CLASS(ggandiva_filter_parent_class)->finalize(object); +} + +static void +ggandiva_filter_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GGANDIVA_FILTER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FILTER: + priv->filter = + *static_cast *>(g_value_get_pointer(value)); + break; + case PROP_SCHEMA: + priv->schema = GARROW_SCHEMA(g_value_dup_object(value)); + break; + case PROP_CONDITION: + priv->condition = GGANDIVA_CONDITION(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +ggandiva_filter_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GGANDIVA_FILTER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_SCHEMA: + g_value_set_object(value, priv->schema); + break; + case PROP_CONDITION: + g_value_set_object(value, priv->condition); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +ggandiva_filter_init(GGandivaFilter *object) +{ + auto priv = GGANDIVA_FILTER_GET_PRIVATE(object); + new(&priv->filter) std::shared_ptr; +} + +static void +ggandiva_filter_class_init(GGandivaFilterClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = ggandiva_filter_dispose; + gobject_class->finalize = ggandiva_filter_finalize; + gobject_class->set_property = ggandiva_filter_set_property; + gobject_class->get_property = ggandiva_filter_get_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("filter", + "Filter", + "The raw std::shared *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FILTER, spec); + + spec = g_param_spec_object("schema", + "Schema", + "The schema for input record batch", + GARROW_TYPE_SCHEMA, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_SCHEMA, spec); + + spec = g_param_spec_object("condition", + "Condition", + "The condition for the filter", + GGANDIVA_TYPE_CONDITION, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_CONDITION, spec); +} + +/** + * ggandiva_filter_new: + * @schema: A #GArrowSchema. + * @condition: The condition to be used. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GGandivaFilter on success, + * %NULL on error. 
+ * + * Since: 4.0.0 + */ +GGandivaFilter * +ggandiva_filter_new(GArrowSchema *schema, + GGandivaCondition *condition, + GError **error) +{ + auto arrow_schema = garrow_schema_get_raw(schema); + auto gandiva_condition = ggandiva_condition_get_raw(condition); + std::shared_ptr gandiva_filter; + auto status = gandiva::Filter::Make(arrow_schema, + gandiva_condition, + &gandiva_filter); + if (garrow_error_check(error, status, "[gandiva][filter][new]")) { + return ggandiva_filter_new_raw(&gandiva_filter, schema, condition); + } else { + return NULL; + } +} + +/** + * ggandiva_filter_evaluate: + * @filter: A #GGandivaFilter. + * @record_batch: A #GArrowRecordBatch. + * @selection_vector: A #GGandivaSelectionVector that is used as + * output. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE otherwise. + * + * Since: 4.0.0 + */ +gboolean +ggandiva_filter_evaluate(GGandivaFilter *filter, + GArrowRecordBatch *record_batch, + GGandivaSelectionVector *selection_vector, + GError **error) +{ + auto gandiva_filter = ggandiva_filter_get_raw(filter); + auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); + auto gandiva_selection_vector = + ggandiva_selection_vector_get_raw(selection_vector); + auto status = gandiva_filter->Evaluate(*arrow_record_batch, + gandiva_selection_vector); + return garrow_error_check(error, status, "[gandiva][filter][evaluate]"); +} + +G_END_DECLS + +GGandivaFilter * +ggandiva_filter_new_raw(std::shared_ptr *gandiva_filter, + GArrowSchema *schema, + GGandivaCondition *condition) +{ + auto filter = g_object_new(GGANDIVA_TYPE_FILTER, + "filter", gandiva_filter, + "schema", schema, + "condition", condition, + NULL); + return GGANDIVA_FILTER(filter); +} + +std::shared_ptr +ggandiva_filter_get_raw(GGandivaFilter *filter) +{ + auto priv = GGANDIVA_FILTER_GET_PRIVATE(filter); + return priv->filter; +} diff --git a/c_glib/gandiva-glib/filter.h b/c_glib/gandiva-glib/filter.h new file mode 100644 index 00000000000..9a0a5dc5d85 --- /dev/null +++ b/c_glib/gandiva-glib/filter.h @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include +#include + +G_BEGIN_DECLS + +#define GGANDIVA_TYPE_FILTER (ggandiva_filter_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaFilter, + ggandiva_filter, + GGANDIVA, + FILTER, + GObject) + +struct _GGandivaFilterClass +{ + GObjectClass parent_class; +}; + +GGandivaFilter * +ggandiva_filter_new(GArrowSchema *schema, + GGandivaCondition *condition, + GError **error); +gboolean +ggandiva_filter_evaluate(GGandivaFilter *filter, + GArrowRecordBatch *record_batch, + GGandivaSelectionVector *selection_vector, + GError **error); + +G_END_DECLS diff --git a/c_glib/gandiva-glib/filter.hpp b/c_glib/gandiva-glib/filter.hpp new file mode 100644 index 00000000000..a0bee9120a7 --- /dev/null +++ b/c_glib/gandiva-glib/filter.hpp @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include + +#include + +GGandivaFilter * +ggandiva_filter_new_raw(std::shared_ptr *gandiva_filter, + GArrowSchema *schema, + GGandivaCondition *condition); +std::shared_ptr +ggandiva_filter_get_raw(GGandivaFilter *filter); diff --git a/c_glib/gandiva-glib/gandiva-glib.h b/c_glib/gandiva-glib/gandiva-glib.h index 7d1c3d92696..9c1a1604d39 100644 --- a/c_glib/gandiva-glib/gandiva-glib.h +++ b/c_glib/gandiva-glib/gandiva-glib.h @@ -22,8 +22,10 @@ #include #include +#include #include #include #include #include #include +#include diff --git a/c_glib/gandiva-glib/gandiva-glib.hpp b/c_glib/gandiva-glib/gandiva-glib.hpp index 8d857a3d8df..eb39f5838ee 100644 --- a/c_glib/gandiva-glib/gandiva-glib.hpp +++ b/c_glib/gandiva-glib/gandiva-glib.hpp @@ -22,5 +22,7 @@ #include #include +#include #include #include +#include diff --git a/c_glib/gandiva-glib/gandiva-glib.pc.in b/c_glib/gandiva-glib/gandiva-glib.pc.in deleted file mode 100644 index 7160f5ff422..00000000000 --- a/c_glib/gandiva-glib/gandiva-glib.pc.in +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: Apache Arrow Gandiva GLib -Description: C API for Apache Arrow Gandiva based on GLib -Version: @VERSION@ -Libs: -L${libdir} -lgandiva-glib -Cflags: -I${includedir} -Requires: gandiva arrow-glib diff --git a/c_glib/gandiva-glib/meson.build b/c_glib/gandiva-glib/meson.build index d4ee81dd22b..5127d67afca 100644 --- a/c_glib/gandiva-glib/meson.build +++ b/c_glib/gandiva-glib/meson.build @@ -21,30 +21,36 @@ project_name = 'gandiva-glib' sources = files( 'expression.cpp', + 'filter.cpp', 'function-registry.cpp', 'function-signature.cpp', - 'node.cpp', 'native-function.cpp', + 'node.cpp', 'projector.cpp', + 'selection-vector.cpp', ) c_headers = files( 'expression.h', + 'filter.h', 'function-registry.h', 'function-signature.h', 'gandiva-glib.h', - 'node.h', 'native-function.h', + 'node.h', 'projector.h', + 'selection-vector.h', ) cpp_headers = files( 'expression.hpp', + 'filter.hpp', 'function-signature.hpp', 'gandiva-glib.hpp', - 'node.hpp', 'native-function.hpp', + 'node.hpp', 'projector.hpp', + 'selection-vector.hpp', ) version_h_conf = configuration_data() diff --git a/c_glib/gandiva-glib/node.cpp b/c_glib/gandiva-glib/node.cpp index 28106624494..68f65c6fa14 100644 --- a/c_glib/gandiva-glib/node.cpp +++ b/c_glib/gandiva-glib/node.cpp @@ -116,9 +116,9 @@ enum { PROP_RETURN_TYPE }; -G_DEFINE_TYPE_WITH_PRIVATE(GGandivaNode, - ggandiva_node, - G_TYPE_OBJECT) +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GGandivaNode, + ggandiva_node, + G_TYPE_OBJECT) #define GGANDIVA_NODE_GET_PRIVATE(object) \ static_cast( \ diff --git a/c_glib/gandiva-glib/projector.cpp b/c_glib/gandiva-glib/projector.cpp index ebcc4873c8d..c1cb19e2d2c 100644 --- a/c_glib/gandiva-glib/projector.cpp +++ b/c_glib/gandiva-glib/projector.cpp @@ -22,12 +22,13 @@ #endif #include +#include #include #include -#include #include #include +#include G_BEGIN_DECLS @@ -36,19 +37,25 @@ G_BEGIN_DECLS * @title: Projector classes * @include: gandiva-glib/gandiva-glib.h * - * #GGandivaProjector is a class for building a specific schema - * and vector of expressions. + * #GGandivaProjector is a class that evaluates given expressions + * against the given record batches. + * + * #GGandivaSelectableProjector is a class that evaluates given expressions + * against the given selected records in the given record batches. 
* * Since: 0.12.0 */ typedef struct GGandivaProjectorPrivate_ { std::shared_ptr projector; + GArrowSchema *schema; + GList *expressions; } GGandivaProjectorPrivate; enum { - PROP_0, - PROP_PROJECTOR + PROP_PROJECTOR = 1, + PROP_SCHEMA, + PROP_EXPRESSIONS, }; G_DEFINE_TYPE_WITH_PRIVATE(GGandivaProjector, @@ -60,6 +67,22 @@ G_DEFINE_TYPE_WITH_PRIVATE(GGandivaProjector, ggandiva_projector_get_instance_private( \ GGANDIVA_PROJECTOR(obj))) +static void +ggandiva_projector_dispose(GObject *object) +{ + auto priv = GGANDIVA_PROJECTOR_GET_PRIVATE(object); + + if (priv->schema) { + g_object_unref(G_OBJECT(priv->schema)); + priv->schema = nullptr; + } + + g_list_free_full(priv->expressions, g_object_unref); + priv->expressions = nullptr; + + G_OBJECT_CLASS(ggandiva_projector_parent_class)->dispose(object); +} + static void ggandiva_projector_finalize(GObject *object) { @@ -83,6 +106,33 @@ ggandiva_projector_set_property(GObject *object, priv->projector = *static_cast *>(g_value_get_pointer(value)); break; + case PROP_SCHEMA: + priv->schema = GARROW_SCHEMA(g_value_dup_object(value)); + break; + case PROP_EXPRESSIONS: + priv->expressions = + g_list_copy_deep(static_cast(g_value_get_pointer(value)), + reinterpret_cast(g_object_ref), + nullptr); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +ggandiva_projector_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GGANDIVA_PROJECTOR_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_SCHEMA: + g_value_set_object(value, priv->schema); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -99,19 +149,35 @@ ggandiva_projector_init(GGandivaProjector *object) static void ggandiva_projector_class_init(GGandivaProjectorClass *klass) { - GParamSpec *spec; - auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = ggandiva_projector_dispose; gobject_class->finalize = ggandiva_projector_finalize; gobject_class->set_property = ggandiva_projector_set_property; + gobject_class->get_property = ggandiva_projector_get_property; + GParamSpec *spec; spec = g_param_spec_pointer("projector", "Projector", "The raw std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_PROJECTOR, spec); + + spec = g_param_spec_object("schema", + "Schema", + "The schema of the projector", + GARROW_TYPE_SCHEMA, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_SCHEMA, spec); + + spec = g_param_spec_pointer("expressions", + "Expressions", + "The expressions for the projector", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_EXPRESSIONS, spec); } /** @@ -143,7 +209,9 @@ ggandiva_projector_new(GArrowSchema *schema, gandiva_expressions, &gandiva_projector); if (garrow_error_check(error, status, "[gandiva][projector][new]")) { - return ggandiva_projector_new_raw(&gandiva_projector); + return ggandiva_projector_new_raw(&gandiva_projector, + schema, + expressions); } else { return NULL; } @@ -185,17 +253,140 @@ ggandiva_projector_evaluate(GGandivaProjector *projector, } } + +G_DEFINE_TYPE(GGandivaSelectableProjector, + ggandiva_selectable_projector, + GGANDIVA_TYPE_PROJECTOR) + +static void +ggandiva_selectable_projector_init(GGandivaSelectableProjector *object) +{ +} + +static void 
+ggandiva_selectable_projector_class_init(GGandivaSelectableProjectorClass *klass) +{ +} + +/** + * ggandiva_selectable_projector_new: + * @schema: A #GArrowSchema. + * @expressions: (element-type GGandivaExpression): The built expressions. + * @mode: A #GGandivaSelectionVectorMode to be used. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GGandivaProjector on success, + * %NULL on error. + * + * Since: 4.0.0 + */ +GGandivaSelectableProjector * +ggandiva_selectable_projector_new(GArrowSchema *schema, + GList *expressions, + GGandivaSelectionVectorMode mode, + GError **error) +{ + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector> gandiva_expressions; + for (auto node = expressions; node; node = g_list_next(node)) { + auto expression = GGANDIVA_EXPRESSION(node->data); + auto gandiva_expression = ggandiva_expression_get_raw(expression); + gandiva_expressions.push_back(gandiva_expression); + } + auto gandiva_mode = static_cast(mode); + auto gandiva_configuration = + gandiva::ConfigurationBuilder::DefaultConfiguration(); + std::shared_ptr gandiva_projector; + auto status = gandiva_projector->Make(arrow_schema, + gandiva_expressions, + gandiva_mode, + gandiva_configuration, + &gandiva_projector); + if (garrow_error_check(error, + status, + "[gandiva][selectable-projector][new]")) { + return ggandiva_selectable_projector_new_raw(&gandiva_projector, + schema, + expressions); + } else { + return NULL; + } +} + +/** + * ggandiva_selectable_projector_evaluate: + * @projector: A #GGandivaSelectableProjector. + * @record_batch: A #GArrowRecordBatch. + * @selection_vector: A #GGandivaSelectionVector that specifies + * the filtered row positions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (element-type GArrowArray) (nullable) (transfer full): + * The #GArrowArray as the result evaluated on success, %NULL on error. 
+ * + * Since: 4.0.0 + */ +GList * +ggandiva_selectable_projector_evaluate( + GGandivaSelectableProjector *projector, + GArrowRecordBatch *record_batch, + GGandivaSelectionVector *selection_vector, + GError **error) +{ + auto gandiva_projector = + ggandiva_projector_get_raw(GGANDIVA_PROJECTOR(projector)); + auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); + auto gandiva_selection_vector = + ggandiva_selection_vector_get_raw(selection_vector).get(); + auto memory_pool = arrow::default_memory_pool(); + arrow::ArrayVector arrow_arrays; + auto status = + gandiva_projector->Evaluate(*arrow_record_batch, + gandiva_selection_vector, + memory_pool, + &arrow_arrays); + if (garrow_error_check(error, + status, + "[gandiva][selectable-projector][evaluate]")) { + GList *arrays = NULL; + for (auto arrow_array : arrow_arrays) { + auto array = garrow_array_new_raw(&arrow_array); + arrays = g_list_prepend(arrays, array); + } + return g_list_reverse(arrays); + } else { + return NULL; + } +} + G_END_DECLS GGandivaProjector * -ggandiva_projector_new_raw(std::shared_ptr *gandiva_projector) +ggandiva_projector_new_raw( + std::shared_ptr *gandiva_projector, + GArrowSchema *schema, + GList *expressions) { auto projector = g_object_new(GGANDIVA_TYPE_PROJECTOR, "projector", gandiva_projector, + "schema", schema, + "expressions", expressions, NULL); return GGANDIVA_PROJECTOR(projector); } +GGandivaSelectableProjector * +ggandiva_selectable_projector_new_raw( + std::shared_ptr *gandiva_projector, + GArrowSchema *schema, + GList *expressions) +{ + auto projector = g_object_new(GGANDIVA_TYPE_SELECTABLE_PROJECTOR, + "projector", gandiva_projector, + NULL); + return GGANDIVA_SELECTABLE_PROJECTOR(projector); +} + std::shared_ptr ggandiva_projector_get_raw(GGandivaProjector *projector) { diff --git a/c_glib/gandiva-glib/projector.h b/c_glib/gandiva-glib/projector.h index ae6dead9521..5dd218b808c 100644 --- a/c_glib/gandiva-glib/projector.h +++ b/c_glib/gandiva-glib/projector.h @@ -19,7 +19,7 @@ #pragma once -#include +#include G_BEGIN_DECLS @@ -35,11 +35,41 @@ struct _GGandivaProjectorClass GObjectClass parent_class; }; -GGandivaProjector *ggandiva_projector_new(GArrowSchema *schema, - GList *expressions, - GError **error); -GList *ggandiva_projector_evaluate(GGandivaProjector *projector, - GArrowRecordBatch *record_batch, - GError **error); +GGandivaProjector * +ggandiva_projector_new(GArrowSchema *schema, + GList *expressions, + GError **error); +GList * +ggandiva_projector_evaluate(GGandivaProjector *projector, + GArrowRecordBatch *record_batch, + GError **error); + + +#define GGANDIVA_TYPE_SELECTABLE_PROJECTOR \ + (ggandiva_selectable_projector_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaSelectableProjector, + ggandiva_selectable_projector, + GGANDIVA, + SELECTABLE_PROJECTOR, + GGandivaProjector) + +struct _GGandivaSelectableProjectorClass +{ + GGandivaProjectorClass parent_class; +}; + +GGANDIVA_AVAILABLE_IN_4_0 +GGandivaSelectableProjector * +ggandiva_selectable_projector_new(GArrowSchema *schema, + GList *expressions, + GGandivaSelectionVectorMode mode, + GError **error); +GGANDIVA_AVAILABLE_IN_4_0 +GList * +ggandiva_selectable_projector_evaluate(GGandivaSelectableProjector *projector, + GArrowRecordBatch *record_batch, + GGandivaSelectionVector *selection_vector, + GError **error); + G_END_DECLS diff --git a/c_glib/gandiva-glib/projector.hpp b/c_glib/gandiva-glib/projector.hpp index 1e9359b3342..b372f32f598 100644 --- a/c_glib/gandiva-glib/projector.hpp +++ b/c_glib/gandiva-glib/projector.hpp 
@@ -25,5 +25,15 @@ #include -GGandivaProjector *ggandiva_projector_new_raw(std::shared_ptr *gandiva_projector); -std::shared_ptr ggandiva_projector_get_raw(GGandivaProjector *projector); +GGandivaProjector * +ggandiva_projector_new_raw( + std::shared_ptr *gandiva_projector, + GArrowSchema *schema, + GList *expressions); +GGandivaSelectableProjector * +ggandiva_selectable_projector_new_raw( + std::shared_ptr *gandiva_projector, + GArrowSchema *schema, + GList *expressions); +std::shared_ptr +ggandiva_projector_get_raw(GGandivaProjector *projector); diff --git a/c_glib/gandiva-glib/selection-vector.cpp b/c_glib/gandiva-glib/selection-vector.cpp new file mode 100644 index 00000000000..1c1fa0448fa --- /dev/null +++ b/c_glib/gandiva-glib/selection-vector.cpp @@ -0,0 +1,327 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include + +#include + + +G_BEGIN_DECLS + +/** + * SECTION: selection-vector + * @section_id: selection-vector-classes + * @title: Selection vector classes + * @include: gandiva-glib/gandiva-glib.h + * + * #GGandivaSelectionVector is a base class for a selection vector. + * + * #GGandivaUInt16SelectionVector is a class for a selection vector + * that uses 16-bit unsigned integer for each index. + * + * #GGandivaUInt32SelectionVector is a class for a selection vector + * that uses 32-bit unsigned integer for each index. + * + * #GGandivaUInt64SelectionVector is a class for a selection vector + * that uses 64-bit unsigned integer for each index. 
+ * + * Since: 4.0.0 + */ + +typedef struct GGandivaSelectionVectorPrivate_ { + std::shared_ptr selection_vector; +} GGandivaSelectionVectorPrivate; + +enum { + PROP_SELECTION_VECTOR = 1, +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GGandivaSelectionVector, + ggandiva_selection_vector, + G_TYPE_OBJECT) + +#define GGANDIVA_SELECTION_VECTOR_GET_PRIVATE(object) \ + static_cast( \ + ggandiva_selection_vector_get_instance_private( \ + GGANDIVA_SELECTION_VECTOR(object))) + +static void +ggandiva_selection_vector_finalize(GObject *object) +{ + auto priv = GGANDIVA_SELECTION_VECTOR_GET_PRIVATE(object); + + priv->selection_vector.~shared_ptr(); + + G_OBJECT_CLASS(ggandiva_selection_vector_parent_class)->finalize(object); +} + +static void +ggandiva_selection_vector_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GGANDIVA_SELECTION_VECTOR_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_SELECTION_VECTOR: + priv->selection_vector = + *static_cast *>( + g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +ggandiva_selection_vector_init(GGandivaSelectionVector *object) +{ + auto priv = GGANDIVA_SELECTION_VECTOR_GET_PRIVATE(object); + new(&priv->selection_vector) std::shared_ptr; +} + +static void +ggandiva_selection_vector_class_init(GGandivaSelectionVectorClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = ggandiva_selection_vector_finalize; + gobject_class->set_property = ggandiva_selection_vector_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("selection-vector", + "Selection vector", + "The raw std::shared *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_SELECTION_VECTOR, spec); +} + +/** + * ggandiva_selection_vector_get_mode: + * @selection_vector: A #GGandivaSelectionVector. + * + * Returns: A #GGandivaSelectionVectorMode for the selection vector. + * + * Since: 4.0.0 + */ +GGandivaSelectionVectorMode +ggandiva_selection_vector_get_mode(GGandivaSelectionVector *selection_vector) +{ + auto gandiva_selection_vector = + ggandiva_selection_vector_get_raw(selection_vector); + auto gandiva_mode = gandiva_selection_vector->GetMode(); + return static_cast(gandiva_mode); +} + +/** + * ggandiva_selection_vector_to_array: + * @selection_vector: A #GGandivaSelectionVector. + * + * Returns: (transfer full): A #GArrowArray that has the same content + * of the selection vector. + * + * Since: 4.0.0 + */ +GArrowArray * +ggandiva_selection_vector_to_array(GGandivaSelectionVector *selection_vector) +{ + auto gandiva_selection_vector = + ggandiva_selection_vector_get_raw(selection_vector); + auto arrow_array = gandiva_selection_vector->ToArray(); + return garrow_array_new_raw(&arrow_array); +} + + +G_DEFINE_TYPE(GGandivaUInt16SelectionVector, + ggandiva_uint16_selection_vector, + GGANDIVA_TYPE_SELECTION_VECTOR) + +static void +ggandiva_uint16_selection_vector_init( + GGandivaUInt16SelectionVector *selection_vector) +{ +} + +static void +ggandiva_uint16_selection_vector_class_init( + GGandivaUInt16SelectionVectorClass *klass) +{ +} + +/** + * ggandiva_uint16_selection_vector_new: + * @max_slots: The max number of slots. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: A newly created #GGandivaUInt16SelectionVector. 
+ * + * Since: 4.0.0 + */ +GGandivaUInt16SelectionVector * +ggandiva_uint16_selection_vector_new(gint64 max_slots, + GError **error) +{ + auto memory_pool = arrow::default_memory_pool(); + std::shared_ptr gandiva_selection_vector; + auto status = gandiva::SelectionVector::MakeInt16(max_slots, + memory_pool, + &gandiva_selection_vector); + if (garrow_error_check(error, + status, + "[gandiva][uint16-selection-vector][new]")) { + return GGANDIVA_UINT16_SELECTION_VECTOR( + ggandiva_selection_vector_new_raw(&gandiva_selection_vector)); + } else { + return NULL; + } +} + + +G_DEFINE_TYPE(GGandivaUInt32SelectionVector, + ggandiva_uint32_selection_vector, + GGANDIVA_TYPE_SELECTION_VECTOR) + +static void +ggandiva_uint32_selection_vector_init( + GGandivaUInt32SelectionVector *selection_vector) +{ +} + +static void +ggandiva_uint32_selection_vector_class_init( + GGandivaUInt32SelectionVectorClass *klass) +{ +} + +/** + * ggandiva_uint32_selection_vector_new: + * @max_slots: The max number of slots. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: A newly created #GGandivaUInt32SelectionVector. + * + * Since: 4.0.0 + */ +GGandivaUInt32SelectionVector * +ggandiva_uint32_selection_vector_new(gint64 max_slots, + GError **error) +{ + auto memory_pool = arrow::default_memory_pool(); + std::shared_ptr gandiva_selection_vector; + auto status = gandiva::SelectionVector::MakeInt32(max_slots, + memory_pool, + &gandiva_selection_vector); + if (garrow_error_check(error, + status, + "[gandiva][uint32-selection-vector][new]")) { + return GGANDIVA_UINT32_SELECTION_VECTOR( + ggandiva_selection_vector_new_raw(&gandiva_selection_vector)); + } else { + return NULL; + } +} + + +G_DEFINE_TYPE(GGandivaUInt64SelectionVector, + ggandiva_uint64_selection_vector, + GGANDIVA_TYPE_SELECTION_VECTOR) + +static void +ggandiva_uint64_selection_vector_init( + GGandivaUInt64SelectionVector *selection_vector) +{ +} + +static void +ggandiva_uint64_selection_vector_class_init( + GGandivaUInt64SelectionVectorClass *klass) +{ +} + +/** + * ggandiva_uint64_selection_vector_new: + * @max_slots: The max number of slots. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: A newly created #GGandivaUInt64SelectionVector. 
+ * + * Since: 4.0.0 + */ +GGandivaUInt64SelectionVector * +ggandiva_uint64_selection_vector_new(gint64 max_slots, + GError **error) +{ + auto memory_pool = arrow::default_memory_pool(); + std::shared_ptr gandiva_selection_vector; + auto status = gandiva::SelectionVector::MakeInt64(max_slots, + memory_pool, + &gandiva_selection_vector); + if (garrow_error_check(error, + status, + "[gandiva][uint64-selection-vector][new]")) { + return GGANDIVA_UINT64_SELECTION_VECTOR( + ggandiva_selection_vector_new_raw(&gandiva_selection_vector)); + } else { + return NULL; + } +} + + +G_END_DECLS + + +GGandivaSelectionVector * +ggandiva_selection_vector_new_raw( + std::shared_ptr *gandiva_selection_vector) +{ + GType type = GGANDIVA_TYPE_SELECTION_VECTOR; + switch ((*gandiva_selection_vector)->GetMode()) { + case gandiva::SelectionVector::Mode::MODE_UINT16: + type = GGANDIVA_TYPE_UINT16_SELECTION_VECTOR; + break; + case gandiva::SelectionVector::Mode::MODE_UINT32: + type = GGANDIVA_TYPE_UINT32_SELECTION_VECTOR; + break; + case gandiva::SelectionVector::Mode::MODE_UINT64: + type = GGANDIVA_TYPE_UINT64_SELECTION_VECTOR; + break; + default: + break; + } + auto selection_vector = + g_object_new(type, + "selection-vector", gandiva_selection_vector, + NULL); + return GGANDIVA_SELECTION_VECTOR(selection_vector); +} + +std::shared_ptr +ggandiva_selection_vector_get_raw(GGandivaSelectionVector *selection_vector) +{ + auto priv = GGANDIVA_SELECTION_VECTOR_GET_PRIVATE(selection_vector); + return priv->selection_vector; +} diff --git a/c_glib/gandiva-glib/selection-vector.h b/c_glib/gandiva-glib/selection-vector.h new file mode 100644 index 00000000000..029c4cde5ca --- /dev/null +++ b/c_glib/gandiva-glib/selection-vector.h @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include + +G_BEGIN_DECLS + +/** + * GGandivaSelectionVectorMode: + * @GGANDIVA_SELECTION_VECTOR_MODE_NONE: Selection vector isn't used. + * @GGANDIVA_SELECTION_VECTOR_MODE_UINT16: + * #GGandivaUInt16SelectionVector is used. + * @GGANDIVA_SELECTION_VECTOR_MODE_UINT32: + * #GGandivaUInt32SelectionVector is used. + * @GGANDIVA_SELECTION_VECTOR_MODE_UINT64: + * #GGandivaUInt64SelectionVector is used. + * + * They are corresponding to `gandiva::SelectionVector::Mode` values. 
+ * + * Since: 4.0.0 + */ +typedef enum { + GGANDIVA_SELECTION_VECTOR_MODE_NONE, + GGANDIVA_SELECTION_VECTOR_MODE_UINT16, + GGANDIVA_SELECTION_VECTOR_MODE_UINT32, + GGANDIVA_SELECTION_VECTOR_MODE_UINT64, +} GGandivaSelectionVectorMode; + + +#define GGANDIVA_TYPE_SELECTION_VECTOR (ggandiva_selection_vector_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaSelectionVector, + ggandiva_selection_vector, + GGANDIVA, + SELECTION_VECTOR, + GObject) + +struct _GGandivaSelectionVectorClass +{ + GObjectClass parent_class; +}; + +GGANDIVA_AVAILABLE_IN_4_0 +GGandivaSelectionVectorMode +ggandiva_selection_vector_get_mode(GGandivaSelectionVector *selection_vector); + +GGANDIVA_AVAILABLE_IN_4_0 +GArrowArray * +ggandiva_selection_vector_to_array(GGandivaSelectionVector *selection_vector); + + +#define GGANDIVA_TYPE_UINT16_SELECTION_VECTOR \ + (ggandiva_uint16_selection_vector_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaUInt16SelectionVector, + ggandiva_uint16_selection_vector, + GGANDIVA, + UINT16_SELECTION_VECTOR, + GGandivaSelectionVector) + +struct _GGandivaUInt16SelectionVectorClass +{ + GGandivaSelectionVectorClass parent_class; +}; + +GGANDIVA_AVAILABLE_IN_4_0 +GGandivaUInt16SelectionVector * +ggandiva_uint16_selection_vector_new(gint64 max_slots, + GError **error); + + +#define GGANDIVA_TYPE_UINT32_SELECTION_VECTOR \ + (ggandiva_uint32_selection_vector_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaUInt32SelectionVector, + ggandiva_uint32_selection_vector, + GGANDIVA, + UINT32_SELECTION_VECTOR, + GGandivaSelectionVector) + +struct _GGandivaUInt32SelectionVectorClass +{ + GGandivaSelectionVectorClass parent_class; +}; + +GGANDIVA_AVAILABLE_IN_4_0 +GGandivaUInt32SelectionVector * +ggandiva_uint32_selection_vector_new(gint64 max_slots, + GError **error); + + +#define GGANDIVA_TYPE_UINT64_SELECTION_VECTOR \ + (ggandiva_uint64_selection_vector_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaUInt64SelectionVector, + ggandiva_uint64_selection_vector, + GGANDIVA, + UINT64_SELECTION_VECTOR, + GGandivaSelectionVector) + +struct _GGandivaUInt64SelectionVectorClass +{ + GGandivaSelectionVectorClass parent_class; +}; + +GGANDIVA_AVAILABLE_IN_4_0 +GGandivaUInt64SelectionVector * +ggandiva_uint64_selection_vector_new(gint64 max_slots, + GError **error); + + +G_END_DECLS diff --git a/c_glib/gandiva-glib/selection-vector.hpp b/c_glib/gandiva-glib/selection-vector.hpp new file mode 100644 index 00000000000..aec583141e9 --- /dev/null +++ b/c_glib/gandiva-glib/selection-vector.hpp @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include + +#include + +#include + +GGandivaSelectionVector * +ggandiva_selection_vector_new_raw( + std::shared_ptr *gandiva_selection_vector); +std::shared_ptr +ggandiva_selection_vector_get_raw(GGandivaSelectionVector *selection_vector); diff --git a/c_glib/gandiva-glib/version.h.in b/c_glib/gandiva-glib/version.h.in index 85cfe6d0cfb..3c9e87c9d52 100644 --- a/c_glib/gandiva-glib/version.h.in +++ b/c_glib/gandiva-glib/version.h.in @@ -119,6 +119,15 @@ */ #define GGANDIVA_VERSION_1_0 G_ENCODE_VERSION(1, 0) +/** + * GGANDIVA_VERSION_4_0: + * + * You can use this macro value for compile time API version check. + * + * Since: 4.0.0 + */ +#define GGANDIVA_VERSION_4_0 G_ENCODE_VERSION(4, 0) + /** * GGANDIVA_VERSION_MIN_REQUIRED: * @@ -166,6 +175,20 @@ #define GGANDIVA_AVAILABLE_IN_ALL +#if GGANDIVA_VERSION_MIN_REQUIRED >= GGANDIVA_VERSION_4_0 +# define GGANDIVA_DEPRECATED_IN_4_0 GGANDIVA_DEPRECATED +# define GGANDIVA_DEPRECATED_IN_4_0_FOR(function) GGANDIVA_DEPRECATED_FOR(function) +#else +# define GGANDIVA_DEPRECATED_IN_4_0 +# define GGANDIVA_DEPRECATED_IN_4_0_FOR(function) +#endif + +#if GGANDIVA_VERSION_MAX_ALLOWED < GGANDIVA_VERSION_4_0 +# define GGANDIVA_AVAILABLE_IN_4_0 GGANDIVA_UNAVAILABLE(4, 0) +#else +# define GGANDIVA_AVAILABLE_IN_4_0 +#endif + #if GGANDIVA_VERSION_MIN_REQUIRED >= GGANDIVA_VERSION_1_0 # define GGANDIVA_DEPRECATED_IN_1_0 GGANDIVA_DEPRECATED # define GGANDIVA_DEPRECATED_IN_1_0_FOR(function) GGANDIVA_DEPRECATED_FOR(function) diff --git a/c_glib/parquet-glib/Makefile.am b/c_glib/parquet-glib/Makefile.am deleted file mode 100644 index a813b3ce9cc..00000000000 --- a/c_glib/parquet-glib/Makefile.am +++ /dev/null @@ -1,145 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -CLEANFILES = -DISTCLEANFILES = - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(GARROW_CFLAGS) \ - $(GPARQUET_CFLAGS) - -if HAVE_PARQUET -lib_LTLIBRARIES = \ - libparquet-glib.la - -libparquet_glib_la_CXXFLAGS = \ - $(GLIB_CFLAGS) \ - $(PARQUET_CFLAGS) \ - $(GARROW_CFLAGS) \ - $(GPARQUET_CXXFLAGS) - -libparquet_glib_la_LDFLAGS = \ - -version-info $(LT_VERSION_INFO) \ - -no-undefined - -libparquet_glib_la_LIBADD = \ - $(GLIB_LIBS) \ - ../arrow-glib/libarrow-glib.la \ - $(PARQUET_LIBS) - -libparquet_glib_la_headers = \ - arrow-file-reader.h \ - arrow-file-writer.h \ - parquet-glib.h - -libparquet_glib_la_generated_headers = \ - version.h - -libparquet_glib_la_sources = \ - arrow-file-reader.cpp \ - arrow-file-writer.cpp \ - $(libparquet_glib_la_headers) \ - $(libparquet_glib_la_generated_headers) - -libparquet_glib_la_cpp_headers = \ - arrow-file-reader.hpp \ - arrow-file-writer.hpp \ - parquet-glib.hpp - -libparquet_glib_la_SOURCES = \ - $(libparquet_glib_la_sources) \ - $(libparquet_glib_la_cpp_headers) - -BUILT_SOURCES = \ - $(libparquet_glib_la_generated_headers) - -parquet_glib_includedir = $(includedir)/parquet-glib -parquet_glib_include_HEADERS = \ - $(libparquet_glib_la_headers) \ - $(libparquet_glib_la_cpp_headers) - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = \ - parquet-glib.pc - -# GObject Introspection -if HAVE_INTROSPECTION --include $(INTROSPECTION_MAKEFILE) -INTROSPECTION_GIRS = -INTROSPECTION_SCANNER_ARGS = -INTROSPECTION_SCANNER_ENV = -if USE_ARROW_BUILD_DIR -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_top_builddir}/arrow-glib:$(ARROW_BUILD_DIR)/src/arrow:$${PKG_CONFIG_PATH} -else -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_top_builddir}/arrow-glib:$${PKG_CONFIG_PATH} -endif -INTROSPECTION_COMPILER_ARGS = \ - --includedir=$(abs_top_builddir)/arrow-glib - -Parquet-1.0.gir: libparquet-glib.la -Parquet_1_0_gir_PACKAGES = \ - arrow-glib -Parquet_1_0_gir_EXPORT_PACKAGES = \ - parquet-glib -Parquet_1_0_gir_INCLUDES = \ - Arrow-1.0 -Parquet_1_0_gir_CFLAGS = \ - $(AM_CPPFLAGS) -Parquet_1_0_gir_LIBS = -Parquet_1_0_gir_FILES = $(libparquet_glib_la_sources) -Parquet_1_0_gir_SCANNERFLAGS = \ - --add-include-path=$(abs_top_builddir)/arrow-glib \ - --library-path=$(ARROW_LIB_DIR) \ - --warn-all \ - --identifier-prefix=GParquet \ - --symbol-prefix=gparquet -if OS_MACOS -Parquet_1_0_gir_LIBS += \ - arrow-glib \ - parquet-glib -Parquet_1_0_gir_SCANNERFLAGS += \ - --no-libtool \ - --library-path=$(abs_top_builddir)/arrow-glib/.libs \ - --library-path=$(abs_builddir)/.libs -else -Parquet_1_0_gir_LIBS += \ - $(abs_top_builddir)/arrow-glib/libarrow-glib.la \ - libparquet-glib.la -endif -INTROSPECTION_GIRS += Parquet-1.0.gir - -girdir = $(datadir)/gir-1.0 -gir_DATA = $(INTROSPECTION_GIRS) - -typelibdir = $(libdir)/girepository-1.0 -typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) - -CLEANFILES += \ - $(gir_DATA) \ - $(typelib_DATA) -endif -endif diff --git a/c_glib/parquet-glib/parquet-glib.pc.in b/c_glib/parquet-glib/parquet-glib.pc.in deleted file mode 100644 index 81559f1bce1..00000000000 --- a/c_glib/parquet-glib/parquet-glib.pc.in +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: Apache Parquet GLib -Description: C API for Apache Parquet based on GLib -Version: @VERSION@ -Libs: -L${libdir} -lparquet-glib -Cflags: -I${includedir} -Requires: arrow-glib diff --git a/c_glib/plasma-glib/Makefile.am b/c_glib/plasma-glib/Makefile.am deleted file mode 100644 index 60499a4065f..00000000000 --- a/c_glib/plasma-glib/Makefile.am +++ /dev/null @@ -1,171 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -CLEANFILES = -DISTCLEANFILES = - -EXTRA_DIST = \ - meson.build - -AM_CPPFLAGS = \ - -I$(top_builddir) \ - -I$(top_srcdir) \ - -DG_LOG_DOMAIN=\"Plasma\" - -AM_CFLAGS = \ - $(GLIB_CFLAGS) \ - $(GARROW_CFLAGS) \ - $(GPLASMA_CFLAGS) - -PLASMA_ARROW_CUDA_LIBS = -PLASMA_INTROSPECTION_COMPILER_ARROW_CUDA_ARGS = -PLASMA_GIR_ARROW_CUDA_PACKAGE = -PLASMA_GIR_ARROW_CUDA_SCANNER_ADD_INCLUDE_PATH = -PLASMA_GIR_ARROW_CUDA_LIBS_MACOS = -PLASMA_GIR_ARROW_CUDA_SCANNER_LIBRARY_PATH_MACOS = -PLASMA_GIR_ARROW_CUDA_LIBS = -if HAVE_ARROW_CUDA -PLASMA_ARROW_CUDA_LIBS += \ - ../arrow-cuda-glib/libarrow-cuda-glib.la \ - $(ARROW_CUDA_LIBS) -PLASMA_INTROSPECTION_COMPILER_ARROW_CUDA_ARGS += \ - --includedir=$(abs_top_builddir)/arrow-cuda-glib -PLASMA_GIR_ARROW_CUDA_PACKAGE += \ - arrow-cuda-glib -PLASMA_GIR_ARROW_CUDA_SCANNER_ADD_INCLUDE_PATH += \ - --add-include-path=$(abs_top_builddir)/arrow-cuda-glib -PLASMA_GIR_ARROW_CUDA_LIBS_MACOS += \ - arrow-cuda-glib -PLASMA_GIR_ARROW_CUDA_SCANNER_LIBRARY_PATH_MACOS += \ - --library-path=$(abs_top_builddir)/arrow-cuda-glib/.libs -PLASMA_GIR_ARROW_CUDA_LIBS += \ - $(abs_top_builddir)/arrow-cuda-glib/libarrow-cuda-glib.la -endif - -if HAVE_PLASMA -lib_LTLIBRARIES = \ - libplasma-glib.la - -libplasma_glib_la_CXXFLAGS = \ - $(GLIB_CFLAGS) \ - $(PLASMA_CFLAGS) \ - $(GARROW_CFLAGS) \ - $(GPLASMA_CFLAGS) - -libplasma_glib_la_LDFLAGS = \ - -version-info $(LT_VERSION_INFO) \ - -no-undefined - -libplasma_glib_la_LIBADD = \ - $(GLIB_LIBS) \ - ../arrow-glib/libarrow-glib.la \ - $(PLASMA_LIBS) \ - $(PLASMA_ARROW_CUDA_LIBS) - -libplasma_glib_la_headers = \ - client.h \ - object.h \ - plasma-glib.h - -libplasma_glib_la_sources = \ - client.cpp \ - object.cpp \ - $(libplasma_glib_la_headers) - -libplasma_glib_la_cpp_headers = \ - client.hpp \ - object.hpp \ - plasma-glib.hpp - -libplasma_glib_la_SOURCES = \ - $(libplasma_glib_la_sources) \ - $(libplasma_glib_la_cpp_headers) - -plasma_glib_includedir = $(includedir)/plasma-glib -plasma_glib_include_HEADERS = \ - $(libplasma_glib_la_headers) \ - $(libplasma_glib_la_cpp_headers) - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = \ - plasma-glib.pc - -# GObject Introspection -if HAVE_INTROSPECTION --include $(INTROSPECTION_MAKEFILE) -INTROSPECTION_GIRS = -INTROSPECTION_SCANNER_ARGS = -INTROSPECTION_SCANNER_ENV = -if USE_ARROW_BUILD_DIR -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=$(abs_top_builddir)/arrow-glib$(PLASMA_ARROW_CUDA_PKG_CONFIG_PATH):$(ARROW_BUILD_DIR)/src/arrow:$${PKG_CONFIG_PATH} -else -INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=$(abs_top_builddir)/arrow-glib$(PLASMA_ARROW_CUDA_PKG_CONFIG_PATH):$${PKG_CONFIG_PATH} -endif -INTROSPECTION_COMPILER_ARGS = \ - --includedir=$(abs_top_builddir)/arrow-glib \ - $(PLASMA_INTROSPECTION_COMPILER_ARROW_CUDA_INCLUDEDIR) - -Plasma-1.0.gir: libplasma-glib.la -Plasma_1_0_gir_PACKAGES = \ - arrow-glib \ - $(PLASMA_GIR_ARROW_CUDA_PACKAGE) -Plasma_1_0_gir_EXPORT_PACKAGES = \ - plasma-glib -Plasma_1_0_gir_INCLUDES = \ - Arrow-1.0 -Plasma_1_0_gir_CFLAGS = \ - $(AM_CPPFLAGS) -Plasma_1_0_gir_LIBS = -Plasma_1_0_gir_FILES = $(libplasma_glib_la_sources) -Plasma_1_0_gir_SCANNERFLAGS = \ - --add-include-path=$(abs_top_builddir)/arrow-glib \ - $(PLASMA_GIR_ARROW_CUDA_SCANNER_ADD_INCLUDE_PATH) \ - --library-path=$(ARROW_LIB_DIR) \ - --warn-all \ - --identifier-prefix=GPlasma \ - --symbol-prefix=gplasma -if OS_MACOS -Plasma_1_0_gir_LIBS += \ - arrow-glib \ - $(PLASMA_GIR_ARROW_CUDA_LIBS_MACOS) \ - plasma-glib -Plasma_1_0_gir_SCANNERFLAGS += \ - --no-libtool \ - 
--library-path=$(abs_top_builddir)/arrow-glib/.libs \ - $(PLASMA_GIR_ARROW_CUDA_SCANNER_LIBRARY_PATH_MACOS) \ - --library-path=$(abs_builddir)/.libs -else -Plasma_1_0_gir_LIBS += \ - $(abs_top_builddir)/arrow-glib/libarrow-glib.la \ - $(PLASMA_GIR_ARROW_CUDA_LIBS) \ - libplasma-glib.la -endif -INTROSPECTION_GIRS += Plasma-1.0.gir - -girdir = $(datadir)/gir-1.0 -gir_DATA = $(INTROSPECTION_GIRS) - -typelibdir = $(libdir)/girepository-1.0 -typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) - -CLEANFILES += \ - $(gir_DATA) \ - $(typelib_DATA) -endif -endif diff --git a/c_glib/plasma-glib/plasma-glib.pc.in b/c_glib/plasma-glib/plasma-glib.pc.in deleted file mode 100644 index c82fe69580f..00000000000 --- a/c_glib/plasma-glib/plasma-glib.pc.in +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: Apache Arrow Plasma GLib -Description: C API for Apache Arrow Plasma based on GLib -Version: @VERSION@ -Libs: -L${libdir} -lplasma-glib -Cflags: -I${includedir} -Requires: plasma arrow-glib @ARROW_CUDA_GLIB_PACKAGE@ diff --git a/c_glib/arrow-dataset-glib/arrow-dataset-glib.pc.in b/c_glib/test/gandiva/test-condition.rb similarity index 51% rename from c_glib/arrow-dataset-glib/arrow-dataset-glib.pc.in rename to c_glib/test/gandiva/test-condition.rb index ee7e13967df..51fb9f1b160 100644 --- a/c_glib/arrow-dataset-glib/arrow-dataset-glib.pc.in +++ b/c_glib/test/gandiva/test-condition.rb @@ -15,14 +15,21 @@ # specific language governing permissions and limitations # under the License. 
-prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ +class TestGandivaCondition < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + field1 = Arrow::Field.new("field1", Arrow::Int32DataType.new) + field2 = Arrow::Field.new("field2", Arrow::Int32DataType.new) + field1_node = Gandiva::FieldNode.new(field1) + field2_node = Gandiva::FieldNode.new(field2) + function_node = Gandiva::FunctionNode.new("equal", + [field1_node, field2_node], + Arrow::BooleanDataType.new) + @condition = Gandiva::Condition.new(function_node) + end -Name: Apache Arrow Dataset GLib -Description: C API for Apache Arrow Dataset based on GLib -Version: @VERSION@ -Libs: -L${libdir} -larrow-dataset-glib -Cflags: -I${includedir} -Requires: arrow-glib arrow-dataset + def test_to_s + assert_equal("bool equal((int32) field1, (int32) field2)", + @condition.to_s) + end +end diff --git a/c_glib/test/gandiva/test-filter.rb b/c_glib/test/gandiva/test-filter.rb new file mode 100644 index 00000000000..3da77743174 --- /dev/null +++ b/c_glib/test/gandiva/test-filter.rb @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestGandivaFilter < Test::Unit::TestCase + include Helper::Buildable + + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + + field1 = Arrow::Field.new("field1", Arrow::Int32DataType.new) + field2 = Arrow::Field.new("field2", Arrow::Int32DataType.new) + schema = Arrow::Schema.new([field1, field2]) + field_node1 = Gandiva::FieldNode.new(field1) + field_node2 = Gandiva::FieldNode.new(field2) + equal_function_node = + Gandiva::FunctionNode.new("equal", + [field_node1, field_node2], + Arrow::BooleanDataType.new) + condition = Gandiva::Condition.new(equal_function_node) + @filter = Gandiva::Filter.new(schema, condition) + + input_arrays = [ + build_int32_array([1, 2, 3, 4]), + build_int32_array([11, 2, 15, 4]), + ] + @record_batch = Arrow::RecordBatch.new(schema, + input_arrays[0].length, + input_arrays) + end + + def test_evaluate + selection_vector = Gandiva::UInt16SelectionVector.new(@record_batch.n_rows) + @filter.evaluate(@record_batch, selection_vector) + assert_equal(build_uint16_array([1, 3]), + selection_vector.to_array) + end +end diff --git a/c_glib/test/gandiva/test-projector.rb b/c_glib/test/gandiva/test-projector.rb index 4d3375659ae..308e1c3a5c9 100644 --- a/c_glib/test/gandiva/test-projector.rb +++ b/c_glib/test/gandiva/test-projector.rb @@ -20,33 +20,40 @@ class TestGandivaProjector < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) - end - def test_evaluate field1 = Arrow::Field.new("field1", Arrow::Int32DataType.new) field2 = Arrow::Field.new("field2", Arrow::Int32DataType.new) - schema = Arrow::Schema.new([field1, field2]) - field_node1 = Gandiva::FieldNode.new(field1) - field_node2 = Gandiva::FieldNode.new(field2) - add_function_node = Gandiva::FunctionNode.new("add", - [field_node1, field_node2], - Arrow::Int32DataType.new) - subtract_function_node = Gandiva::FunctionNode.new("subtract", - [field_node1, field_node2], - Arrow::Int32DataType.new) + @schema = Arrow::Schema.new([field1, field2]) + @field_node1 = Gandiva::FieldNode.new(field1) + @field_node2 = Gandiva::FieldNode.new(field2) + add_function_node = + Gandiva::FunctionNode.new("add", + [@field_node1, @field_node2], + Arrow::Int32DataType.new) + subtract_function_node = + Gandiva::FunctionNode.new("subtract", + [@field_node1, @field_node2], + Arrow::Int32DataType.new) add_result = Arrow::Field.new("add_result", Arrow::Int32DataType.new) add_expression = Gandiva::Expression.new(add_function_node, add_result) - subtract_result = Arrow::Field.new("subtract_result", Arrow::Int32DataType.new) - subtract_expression = Gandiva::Expression.new(subtract_function_node, subtract_result) + subtract_result = Arrow::Field.new("subtract_result", + Arrow::Int32DataType.new) + subtract_expression = Gandiva::Expression.new(subtract_function_node, + subtract_result) + @projector = Gandiva::Projector.new(@schema, + [add_expression, subtract_expression]) - projector = Gandiva::Projector.new(schema, - [add_expression, subtract_expression]) input_arrays = [ build_int32_array([1, 2, 3, 4]), build_int32_array([11, 13, 15, 17]), ] - record_batch = Arrow::RecordBatch.new(schema, 4, input_arrays) - outputs = projector.evaluate(record_batch) + @record_batch = Arrow::RecordBatch.new(@schema, + input_arrays[0].length, + input_arrays) + end + + def test_evaluate + outputs = @projector.evaluate(@record_batch) assert_equal([ [12, 15, 18, 21], [-10, -11, -12, -13], diff --git a/c_glib/test/gandiva/test-selectable-projector.rb b/c_glib/test/gandiva/test-selectable-projector.rb new file 
mode 100644 index 00000000000..47b0059a2ef --- /dev/null +++ b/c_glib/test/gandiva/test-selectable-projector.rb @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaSelectableProjector < Test::Unit::TestCase + include Helper::Buildable + + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + + field1 = Arrow::Field.new("field1", Arrow::Int32DataType.new) + field2 = Arrow::Field.new("field2", Arrow::Int32DataType.new) + @schema = Arrow::Schema.new([field1, field2]) + + input_arrays = [ + build_int32_array([1, 2, 3, 4]), + build_int32_array([11, 13, 15, 17]), + ] + @record_batch = Arrow::RecordBatch.new(@schema, + input_arrays[0].length, + input_arrays) + + @field_node1 = Gandiva::FieldNode.new(field1) + @field_node2 = Gandiva::FieldNode.new(field2) + add_function_node = + Gandiva::FunctionNode.new("add", + [@field_node1, @field_node2], + Arrow::Int32DataType.new) + subtract_function_node = + Gandiva::FunctionNode.new("subtract", + [@field_node1, @field_node2], + Arrow::Int32DataType.new) + add_result = Arrow::Field.new("add_result", Arrow::Int32DataType.new) + add_expression = Gandiva::Expression.new(add_function_node, add_result) + subtract_result = Arrow::Field.new("subtract_result", + Arrow::Int32DataType.new) + subtract_expression = Gandiva::Expression.new(subtract_function_node, + subtract_result) + @selection_vector = Gandiva::UInt16SelectionVector.new(@record_batch.n_rows) + @projector = + Gandiva::SelectableProjector.new(@schema, + [add_expression, subtract_expression], + @selection_vector.mode) + end + + def test_evaluate + two_node = Gandiva::Int32LiteralNode.new(2) + condition_node = Gandiva::FunctionNode.new("greater_than", + [@field_node1, two_node], + Arrow::BooleanDataType.new) + condition = Gandiva::Condition.new(condition_node) + filter = Gandiva::Filter.new(@schema, condition) + filter.evaluate(@record_batch, @selection_vector) + outputs = @projector.evaluate(@record_batch, @selection_vector) + assert_equal([ + [18, 21], + [-12, -13], + ], + outputs.collect(&:values)) + end +end diff --git a/dev/release/source/build.sh b/c_glib/test/gandiva/test-selection-vector.rb old mode 100755 new mode 100644 similarity index 53% rename from dev/release/source/build.sh rename to c_glib/test/gandiva/test-selection-vector.rb index 558600e1fb7..ca5042c2874 --- a/dev/release/source/build.sh +++ b/c_glib/test/gandiva/test-selection-vector.rb @@ -1,5 +1,3 @@ -#!/bin/bash -# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,17 +15,28 @@ # specific language governing permissions and limitations # under the License. 
-set -e +class TestGandivaSelectionVector < Test::Unit::TestCase + include Helper::Buildable + + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end -archive_name=$1 -c_glib_including_configure_tar_gz=$2 + def test_uint16 + selection_vector = Gandiva::UInt16SelectionVector.new(10) + assert_equal(build_uint16_array([]), + selection_vector.to_array) + end -tar xf /arrow/${archive_name}.tar + def test_uint32 + selection_vector = Gandiva::UInt32SelectionVector.new(10) + assert_equal(build_uint32_array([]), + selection_vector.to_array) + end -# Run autogen.sh to create c_glib/ source archive containing the configure script -cd ${archive_name}/c_glib -./autogen.sh -rm -rf autom4te.cache -cd - -mv ${archive_name}/c_glib/ c_glib/ -tar czf /arrow/${c_glib_including_configure_tar_gz} c_glib + def test_uint64 + selection_vector = Gandiva::UInt64SelectionVector.new(10) + assert_equal(build_uint64_array([]), + selection_vector.to_array) + end +end diff --git a/ci/docker/ubuntu-16.04-cpp.dockerfile b/ci/docker/ubuntu-16.04-cpp.dockerfile deleted file mode 100644 index 5c98ae30e1e..00000000000 --- a/ci/docker/ubuntu-16.04-cpp.dockerfile +++ /dev/null @@ -1,100 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG base=amd64/ubuntu:16.04 -FROM ${base} - -SHELL ["/bin/bash", "-o", "pipefail", "-c"] - -ENV DEBIAN_FRONTEND noninteractive - -# LLVM 10 or later requires C++ 14 but g++-5's C++ 14 support is limited. -# cpp/src/arrow/vendored/datetime/date.h doesn't work. 
-# ARG llvm -ENV llvm=8 -RUN apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - apt-transport-https \ - software-properties-common \ - wget && \ - wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ - apt-add-repository -y "deb https://apt.llvm.org/xenial/ llvm-toolchain-xenial-${llvm} main" && \ - apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - autoconf \ - ca-certificates \ - ccache \ - clang-${llvm} \ - cmake \ - g++ \ - gcc \ - gdb \ - git \ - libboost-all-dev \ - libbrotli-dev \ - libbz2-dev \ - libgoogle-glog-dev \ - liblz4-dev \ - libre2-dev \ - libssl-dev \ - libutf8proc-dev \ - libzstd1-dev \ - llvm-${llvm}-dev \ - make \ - ninja-build \ - pkg-config \ - protobuf-compiler \ - python3 \ - tzdata && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# Benchmark is deactivated as the external project requires CMake 3.6+ -# Gandiva JNI is deactivated as it requires CMake 3.11+ -# - c-ares in Xenial isn't recognized by gRPC build system -# - libprotobuf-dev / libprotoc-dev in Xenial too old for gRPC -# - libboost-all-dev does not include Boost.Process, needed for Flight -# unit tests, so doing vendored build by default -ENV ARROW_BUILD_BENCHMARKS=OFF \ - ARROW_BUILD_TESTS=ON \ - ARROW_DATASET=ON \ - ARROW_DEPENDENCY_SOURCE=SYSTEM \ - ARROW_GANDIVA_JAVA=OFF \ - ARROW_GANDIVA=ON \ - ARROW_HOME=/usr/local \ - ARROW_PARQUET=ON \ - ARROW_USE_CCACHE=ON \ - ARROW_WITH_BROTLI=ON \ - ARROW_WITH_BZ2=ON \ - ARROW_WITH_LZ4=ON \ - ARROW_WITH_SNAPPY=ON \ - ARROW_WITH_ZLIB=ON \ - ARROW_WITH_ZSTD=ON \ - BOOST_SOURCE=BUNDLED \ - cares_SOURCE=BUNDLED \ - CC=gcc \ - CXX=g++ \ - gRPC_SOURCE=BUNDLED \ - GTest_SOURCE=BUNDLED \ - ORC_SOURCE=BUNDLED \ - PARQUET_BUILD_EXAMPLES=ON \ - PARQUET_BUILD_EXECUTABLES=ON \ - PATH=/usr/lib/ccache/:$PATH \ - Protobuf_SOURCE=BUNDLED \ - RapidJSON_SOURCE=BUNDLED \ - Snappy_SOURCE=BUNDLED \ - Thrift_SOURCE=BUNDLED diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 1d9e41bba7a..c5b55eef42a 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -79,8 +79,10 @@ build() { export CPPFLAGS="${CPPFLAGS} -I${MINGW_PREFIX}/include" export LIBS="-L${MINGW_PREFIX}/libs" export ARROW_S3=OFF + export ARROW_WITH_RE2=OFF else export ARROW_S3=ON + export ARROW_WITH_RE2=ON fi MSYS2_ARG_CONV_EXCL="-DCMAKE_INSTALL_PREFIX=" \ @@ -105,6 +107,7 @@ build() { -DARROW_SNAPPY_USE_SHARED=OFF \ -DARROW_USE_GLOG=OFF \ -DARROW_WITH_LZ4=ON \ + -DARROW_WITH_RE2="${ARROW_WITH_RE2}" \ -DARROW_WITH_SNAPPY=ON \ -DARROW_WITH_ZLIB=ON \ -DARROW_WITH_ZSTD=ON \ diff --git a/ci/scripts/go_build.sh b/ci/scripts/go_build.sh index bb53bd82131..7093be4d238 100755 --- a/ci/scripts/go_build.sh +++ b/ci/scripts/go_build.sh @@ -27,3 +27,10 @@ go get -d -t -v ./... go install -v ./... popd + +pushd ${source_dir}/parquet + +go get -d -t -v ./... +go install -v ./... + +popd diff --git a/ci/scripts/go_test.sh b/ci/scripts/go_test.sh index 077749fc945..7dd873df3e1 100755 --- a/ci/scripts/go_test.sh +++ b/ci/scripts/go_test.sh @@ -28,3 +28,11 @@ for d in $(go list ./... | grep -v vendor); do done popd + +pushd ${source_dir}/parquet + +for d in $(go list ./... 
| grep -v vendor); do + go test $d +done + +popd diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index aa23e5b7c18..5d2e71916ed 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -33,3 +33,4 @@ archery integration --with-all --run-flight \ --gold-dirs=$gold_dir/1.0.0-bigendian \ --gold-dirs=$gold_dir/1.0.0-littleendian \ --gold-dirs=$gold_dir/2.0.0-compression \ + --gold-dirs=$gold_dir/4.0.0-shareddict \ diff --git a/ci/scripts/msys2_setup.sh b/ci/scripts/msys2_setup.sh index 3f451e96b83..cb6ca30a64e 100755 --- a/ci/scripts/msys2_setup.sh +++ b/ci/scripts/msys2_setup.sh @@ -61,6 +61,7 @@ esac pacman \ --needed \ --noconfirm \ + --refresh \ --sync \ "${packages[@]}" diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh new file mode 100755 index 00000000000..7a021f70f74 --- /dev/null +++ b/ci/scripts/python_wheel_macos_build.sh @@ -0,0 +1,133 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -ex + +source_dir=${1} +build_dir=${2} + +echo "=== (${PYTHON_VERSION}) Clear output directories and leftovers ===" +# Clear output directories and leftovers +rm -rf ${build_dir}/install +rm -rf ${source_dir}/python/dist +rm -rf ${source_dir}/python/build +rm -rf ${source_dir}/python/repaired_wheels +rm -rf ${source_dir}/python/pyarrow/*.so +rm -rf ${source_dir}/python/pyarrow/*.so.* + +echo "=== (${PYTHON_VERSION}) Set OSX SDK and C flags ===" +# Arrow is 64-bit-only at the moment +export CFLAGS="-fPIC -arch x86_64 ${CFLAGS//-arch i386/}" +export CXXFLAGS="-fPIC -arch x86_64 ${CXXFLAGS//-arch i386} -std=c++11" +export SDKROOT="$(xcrun --show-sdk-path)" + +echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" +: ${ARROW_DATASET:=ON} +: ${ARROW_FLIGHT:=ON} +: ${ARROW_GANDIVA:=OFF} +: ${ARROW_HDFS:=ON} +: ${ARROW_JEMALLOC:=ON} +: ${ARROW_MIMALLOC:=ON} +: ${ARROW_ORC:=ON} +: ${ARROW_PARQUET:=ON} +: ${ARROW_PLASMA:=ON} +: ${ARROW_S3:=ON} +: ${ARROW_TENSORFLOW:=ON} +: ${ARROW_WITH_BROTLI:=ON} +: ${ARROW_WITH_BZ2:=ON} +: ${ARROW_WITH_LZ4:=ON} +: ${ARROW_WITH_SNAPPY:=ON} +: ${ARROW_WITH_ZLIB:=ON} +: ${ARROW_WITH_ZSTD:=ON} +: ${CMAKE_BUILD_TYPE:=release} +: ${CMAKE_GENERATOR:=Ninja} +: ${VCPKG_FEATURE_FLAGS:=-manifests} +: ${VCPKG_TARGET_TRIPLET:=${VCPKG_DEFAULT_TRIPLET:-x64-osx-static-${CMAKE_BUILD_TYPE}}} + +mkdir -p ${build_dir}/build +pushd ${build_dir}/build +cmake \ + -DARROW_BUILD_SHARED=ON \ + -DARROW_BUILD_STATIC=OFF \ + -DARROW_BUILD_TESTS=OFF \ + -DARROW_DATASET=${ARROW_DATASET} \ + -DARROW_DEPENDENCY_SOURCE="VCPKG" \ + -DARROW_DEPENDENCY_USE_SHARED=OFF \ + -DARROW_FLIGHT==${ARROW_FLIGHT} \ + -DARROW_GANDIVA=${ARROW_GANDIVA} \ + -DARROW_HDFS=${ARROW_HDFS} \ + -DARROW_JEMALLOC=${ARROW_JEMALLOC} \ + -DARROW_MIMALLOC=${ARROW_MIMALLOC} \ + -DARROW_ORC=${ARROW_ORC} \ + -DARROW_PACKAGE_KIND="manylinux${MANYLINUX_VERSION}" \ + -DARROW_PARQUET=${ARROW_PARQUET} \ + -DARROW_PLASMA=${ARROW_PLASMA} \ + -DARROW_PYTHON=ON \ + -DARROW_RPATH_ORIGIN=ON \ + -DARROW_S3=${ARROW_S3} \ + -DARROW_TENSORFLOW=${ARROW_TENSORFLOW} \ + -DARROW_USE_CCACHE=ON \ + -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI} \ + -DARROW_WITH_BZ2=${ARROW_WITH_BZ2} \ + -DARROW_WITH_LZ4=${ARROW_WITH_LZ4} \ + -DARROW_WITH_SNAPPY=${ARROW_WITH_SNAPPY} \ + -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB} \ + -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD} \ + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DCMAKE_INSTALL_PREFIX=${build_dir}/install \ + -DCMAKE_UNITY_BUILD=ON \ + -DOPENSSL_USE_STATIC_LIBS=ON \ + -DVCPKG_MANIFEST_MODE=OFF \ + -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \ + -G ${CMAKE_GENERATOR} \ + ${source_dir}/cpp +cmake --build . 
--target install +popd + +# Check that we don't expose any unwanted symbols +# check_arrow_visibility + +echo "=== (${PYTHON_VERSION}) Building wheel ===" +export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE} +export PYARROW_BUNDLE_ARROW_CPP=1 +export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR} +export PYARROW_INSTALL_TESTS=1 +export PYARROW_WITH_DATASET=${ARROW_DATASET} +export PYARROW_WITH_FLIGHT=${ARROW_FLIGHT} +export PYARROW_WITH_GANDIVA=${ARROW_GANDIVA} +export PYARROW_WITH_HDFS=${ARROW_HDFS} +export PYARROW_WITH_ORC=${ARROW_ORC} +export PYARROW_WITH_PARQUET=${ARROW_PARQUET} +export PYARROW_WITH_PLASMA=${ARROW_PLASMA} +export PYARROW_WITH_S3=${ARROW_S3} +# PyArrow build configuration +export PKG_CONFIG_PATH=/usr/lib/pkgconfig:${build_dir}/install/lib/pkgconfig + +pushd ${source_dir}/python +python setup.py bdist_wheel +popd + +echo "=== (${PYTHON_VERSION}) Show dynamic libraries the wheel depend on ===" +deps=$(delocate-listdeps ${source_dir}/python/dist/*.whl) + +if echo $deps | grep -v "^@rpath/lib\(arrow\|gandiva\|parquet\|plasma\)"; then + echo "There are non-bundled shared library dependencies." + exit 1 +fi diff --git a/ci/scripts/python_wheel_macos_test.sh b/ci/scripts/python_wheel_macos_test.sh new file mode 100755 index 00000000000..6ac8576d484 --- /dev/null +++ b/ci/scripts/python_wheel_macos_test.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -ex + +source_dir=${1} + +: ${ARROW_S3:=ON} + +export PYARROW_TEST_CYTHON=OFF +export PYARROW_TEST_DATASET=ON +export PYARROW_TEST_GANDIVA=OFF +export PYARROW_TEST_HDFS=ON +export PYARROW_TEST_ORC=ON +export PYARROW_TEST_PANDAS=ON +export PYARROW_TEST_PARQUET=ON +export PYARROW_TEST_PLASMA=ON +export PYARROW_TEST_S3=${ARROW_S3} +export PYARROW_TEST_TENSORFLOW=ON +export PYARROW_TEST_FLIGHT=ON + +export ARROW_TEST_DATA=${source_dir}/testing/data +export PARQUET_TEST_DATA=${source_dir}/submodules/parquet-testing/data + +# Install the built wheels +pip install ${source_dir}/python/dist/*.whl + +# Test that the modules are importable +python -c " +import pyarrow +import pyarrow._hdfs +import pyarrow.csv +import pyarrow.dataset +import pyarrow.flight +import pyarrow.fs +import pyarrow.json +import pyarrow.orc +import pyarrow.parquet +import pyarrow.plasma +" + +if [ "${PYARROW_TEST_S3}" == "ON" ]; then + python -c "import pyarrow._s3fs" +fi + +# Install testing dependencies +pip install -r ${source_dir}/python/requirements-wheel-test.txt + +# Execute unittest +pytest -r s --pyargs pyarrow diff --git a/ci/scripts/r_sanitize.sh b/ci/scripts/r_sanitize.sh index eacee5f17f3..89963eb2dd8 100755 --- a/ci/scripts/r_sanitize.sh +++ b/ci/scripts/r_sanitize.sh @@ -27,7 +27,7 @@ pushd ${source_dir}/tests export TEST_R_WITH_ARROW=TRUE export UBSAN_OPTIONS="print_stacktrace=1,suppressions=/arrow/r/tools/ubsan.supp" -${R_BIN} < testthat.R > testthat.out 2>&1 +${R_BIN} < testthat.R > testthat.out 2>&1 || { cat testthat.out; exit 1; } cat testthat.out if grep -q "runtime error" testthat.out; then diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh index 151e71b8a7c..d447bdf23dd 100755 --- a/ci/scripts/r_test.sh +++ b/ci/scripts/r_test.sh @@ -62,7 +62,7 @@ BEFORE=$(ls -alh ~/) SCRIPT="as_cran <- !identical(tolower(Sys.getenv('NOT_CRAN')), 'true') if (as_cran) { - rcmdcheck::rcmdcheck(args = c('--as-cran', '--run-donttest'), error_on = 'warning', check_dir = 'check') + rcmdcheck::rcmdcheck(args = c('--as-cran', '--run-donttest'), error_on = 'warning', check_dir = 'check', timeout = 3600) } else { if (nzchar(Sys.which('minio'))) { message('Running minio for S3 tests (if build supports them)') @@ -71,7 +71,7 @@ SCRIPT="as_cran <- !identical(tolower(Sys.getenv('NOT_CRAN')), 'true') pid <- sys::exec_background('minio', c('server', minio_dir)) on.exit(tools::pskill(pid)) } - rcmdcheck::rcmdcheck(build_args = '--no-build-vignettes', args = c('--no-manual', '--ignore-vignettes', '--run-donttest'), error_on = 'warning', check_dir = 'check') + rcmdcheck::rcmdcheck(build_args = '--no-build-vignettes', args = c('--no-manual', '--ignore-vignettes', '--run-donttest'), error_on = 'warning', check_dir = 'check', timeout = 3600) }" echo "$SCRIPT" | ${R_BIN} --no-save diff --git a/ci/vcpkg/arm64-linux-static-debug.cmake b/ci/vcpkg/arm64-linux-static-debug.cmake index 5d77b8df7fa..6fea43694cd 100644 --- a/ci/vcpkg/arm64-linux-static-debug.cmake +++ b/ci/vcpkg/arm64-linux-static-debug.cmake @@ -22,5 +22,7 @@ set(VCPKG_CMAKE_SYSTEM_NAME Linux) set(VCPKG_BUILD_TYPE debug) if(NOT CMAKE_HOST_SYSTEM_PROCESSOR) - execute_process(COMMAND "uname" "-m" OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_PROCESSOR OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND "uname" "-m" + OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_PROCESSOR + OUTPUT_STRIP_TRAILING_WHITESPACE) endif() diff --git a/ci/vcpkg/arm64-linux-static-release.cmake b/ci/vcpkg/arm64-linux-static-release.cmake index ebe5bc3fa04..4012848b849 100644 --- 
a/ci/vcpkg/arm64-linux-static-release.cmake +++ b/ci/vcpkg/arm64-linux-static-release.cmake @@ -22,5 +22,7 @@ set(VCPKG_CMAKE_SYSTEM_NAME Linux) set(VCPKG_BUILD_TYPE release) if(NOT CMAKE_HOST_SYSTEM_PROCESSOR) - execute_process(COMMAND "uname" "-m" OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_PROCESSOR OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND "uname" "-m" + OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_PROCESSOR + OUTPUT_STRIP_TRAILING_WHITESPACE) endif() diff --git a/c_glib/autogen.sh b/ci/vcpkg/x64-osx-static-debug.cmake old mode 100755 new mode 100644 similarity index 79% rename from c_glib/autogen.sh rename to ci/vcpkg/x64-osx-static-debug.cmake index eeca380bea8..e8a321ec71a --- a/c_glib/autogen.sh +++ b/ci/vcpkg/x64-osx-static-debug.cmake @@ -1,5 +1,3 @@ -#!/bin/sh -# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,10 +15,11 @@ # specific language governing permissions and limitations # under the License. -set -u -set -e +set(VCPKG_TARGET_ARCHITECTURE x64) +set(VCPKG_CRT_LINKAGE dynamic) +set(VCPKG_LIBRARY_LINKAGE static) -mkdir -p m4 +set(VCPKG_CMAKE_SYSTEM_NAME Darwin) +set(VCPKG_OSX_ARCHITECTURES x86_64) -gtkdocize --copy -autoreconf --install --force +set(VCPKG_BUILD_TYPE debug) diff --git a/c_glib/arrow-glib/arrow-orc-glib.pc.in b/ci/vcpkg/x64-osx-static-release.cmake similarity index 79% rename from c_glib/arrow-glib/arrow-orc-glib.pc.in rename to ci/vcpkg/x64-osx-static-release.cmake index 8e45d402549..956d5b92e73 100644 --- a/c_glib/arrow-glib/arrow-orc-glib.pc.in +++ b/ci/vcpkg/x64-osx-static-release.cmake @@ -15,12 +15,11 @@ # specific language governing permissions and limitations # under the License. 
-prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ +set(VCPKG_TARGET_ARCHITECTURE x64) +set(VCPKG_CRT_LINKAGE dynamic) +set(VCPKG_LIBRARY_LINKAGE static) -Name: Apache Arrow ORC GLib -Description: ORC modules for Apache Arrow GLib -Version: @VERSION@ -Requires: arrow-glib +set(VCPKG_CMAKE_SYSTEM_NAME Darwin) +set(VCPKG_OSX_ARCHITECTURES x86_64) + +set(VCPKG_BUILD_TYPE release) diff --git a/cpp/cmake_modules/FindBoostAlt.cmake b/cpp/cmake_modules/FindBoostAlt.cmake index 123c6dda1c7..1771937125e 100644 --- a/cpp/cmake_modules/FindBoostAlt.cmake +++ b/cpp/cmake_modules/FindBoostAlt.cmake @@ -38,16 +38,14 @@ if(ARROW_BOOST_USE_SHARED) set(BUILD_SHARED_LIBS_KEEP ${BUILD_SHARED_LIBS}) set(BUILD_SHARED_LIBS ON) - find_package(Boost ${BoostAlt_FIND_VERSION_OPTIONS} - COMPONENTS system filesystem) + find_package(Boost ${BoostAlt_FIND_VERSION_OPTIONS} COMPONENTS system filesystem) set(BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS_KEEP}) unset(BUILD_SHARED_LIBS_KEEP) else() # Find static boost headers and libs # TODO Differentiate here between release and debug builds set(Boost_USE_STATIC_LIBS ON) - find_package(Boost ${BoostAlt_FIND_VERSION_OPTIONS} - COMPONENTS system filesystem) + find_package(Boost ${BoostAlt_FIND_VERSION_OPTIONS} COMPONENTS system filesystem) endif() if(Boost_FOUND) diff --git a/cpp/cmake_modules/FindORC.cmake b/cpp/cmake_modules/FindORC.cmake index 1be149c93b2..061a0df2e9e 100644 --- a/cpp/cmake_modules/FindORC.cmake +++ b/cpp/cmake_modules/FindORC.cmake @@ -44,10 +44,9 @@ if(ORC_STATIC_LIB AND ORC_INCLUDE_DIR) add_library(orc::liborc STATIC IMPORTED) set_target_properties(orc::liborc PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES - "${ORC_INCLUDE_DIR}") + INTERFACE_INCLUDE_DIRECTORIES "${ORC_INCLUDE_DIR}") else() - if (ORC_FIND_REQUIRED) + if(ORC_FIND_REQUIRED) message(FATAL_ERROR "ORC library was required in toolchain and unable to locate") endif() set(ORC_FOUND FALSE) diff --git a/cpp/cmake_modules/FindSnappy.cmake b/cpp/cmake_modules/FindSnappy.cmake index 5784cf59220..26cccb786c5 100644 --- a/cpp/cmake_modules/FindSnappy.cmake +++ b/cpp/cmake_modules/FindSnappy.cmake @@ -26,9 +26,13 @@ if(ARROW_SNAPPY_USE_SHARED) else() set(SNAPPY_STATIC_LIB_NAME_BASE "snappy") if(MSVC) - set(SNAPPY_STATIC_LIB_NAME_BASE "${SNAPPY_STATIC_LIB_NAME_BASE}${SNAPPY_MSVC_STATIC_LIB_SUFFIX}") + set(SNAPPY_STATIC_LIB_NAME_BASE + "${SNAPPY_STATIC_LIB_NAME_BASE}${SNAPPY_MSVC_STATIC_LIB_SUFFIX}") endif() - set(SNAPPY_LIB_NAMES "${CMAKE_STATIC_LIBRARY_PREFIX}${SNAPPY_STATIC_LIB_NAME_BASE}${CMAKE_STATIC_LIBRARY_SUFFIX}") + set( + SNAPPY_LIB_NAMES + "${CMAKE_STATIC_LIBRARY_PREFIX}${SNAPPY_STATIC_LIB_NAME_BASE}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) endif() if(Snappy_ROOT) @@ -44,7 +48,9 @@ if(Snappy_ROOT) PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) else() find_library(Snappy_LIB NAMES ${SNAPPY_LIB_NAMES}) - find_path(Snappy_INCLUDE_DIR NAMES snappy.h PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) + find_path(Snappy_INCLUDE_DIR + NAMES snappy.h + PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) endif() find_package_handle_standard_args(Snappy REQUIRED_VARS Snappy_LIB Snappy_INCLUDE_DIR) diff --git a/cpp/cmake_modules/Findutf8proc.cmake b/cpp/cmake_modules/Findutf8proc.cmake index 560321df5db..edea73b8dae 100644 --- a/cpp/cmake_modules/Findutf8proc.cmake +++ b/cpp/cmake_modules/Findutf8proc.cmake @@ -29,37 +29,40 @@ else() endif() set(utf8proc_STATIC_LIB_SUFFIX "${utf8proc_MSVC_STATIC_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}") - 
set(utf8proc_LIB_NAMES "${CMAKE_STATIC_LIBRARY_PREFIX}utf8proc${utf8proc_STATIC_LIB_SUFFIX}") + set(utf8proc_LIB_NAMES + "${CMAKE_STATIC_LIBRARY_PREFIX}utf8proc${utf8proc_STATIC_LIB_SUFFIX}") endif() if(utf8proc_ROOT) - find_library( - utf8proc_LIB - NAMES ${utf8proc_LIB_NAMES} - PATHS ${utf8proc_ROOT} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) + find_library(utf8proc_LIB + NAMES ${utf8proc_LIB_NAMES} + PATHS ${utf8proc_ROOT} + PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} + NO_DEFAULT_PATH) find_path(utf8proc_INCLUDE_DIR NAMES utf8proc.h PATHS ${utf8proc_ROOT} NO_DEFAULT_PATH PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) else() - find_library( - utf8proc_LIB - NAMES ${utf8proc_LIB_NAMES} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) - find_path(utf8proc_INCLUDE_DIR NAMES utf8proc.h PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) + find_library(utf8proc_LIB + NAMES ${utf8proc_LIB_NAMES} + PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) + find_path(utf8proc_INCLUDE_DIR + NAMES utf8proc.h + PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) endif() -find_package_handle_standard_args(utf8proc REQUIRED_VARS utf8proc_LIB utf8proc_INCLUDE_DIR) +find_package_handle_standard_args(utf8proc REQUIRED_VARS utf8proc_LIB + utf8proc_INCLUDE_DIR) if(utf8proc_FOUND) set(utf8proc_FOUND TRUE) add_library(utf8proc::utf8proc UNKNOWN IMPORTED) - set_target_properties(utf8proc::utf8proc - PROPERTIES IMPORTED_LOCATION "${utf8proc_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${utf8proc_INCLUDE_DIR}") + set_target_properties( + utf8proc::utf8proc + PROPERTIES IMPORTED_LOCATION "${utf8proc_LIB}" INTERFACE_INCLUDE_DIRECTORIES + "${utf8proc_INCLUDE_DIR}") if(NOT ARROW_UTF8PROC_USE_SHARED) set_target_properties(utf8proc::utf8proc PROPERTIES INTERFACE_COMPILER_DEFINITIONS "UTF8PROC_STATIC") diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index b534552c3c0..9f68c560472 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -451,7 +451,9 @@ if(ARROW_CPU_FLAG STREQUAL "armv8") endif() set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_ARMV8_ARCH_FLAG}") - add_definitions(-DARROW_HAVE_NEON) + if(NOT ARROW_SIMD_LEVEL STREQUAL "NONE") + add_definitions(-DARROW_HAVE_NEON) + endif() if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.4") diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index bfa3ee15657..05cc642417a 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -405,6 +405,16 @@ else() ) endif() +if(DEFINED ENV{ARROW_BZIP2_URL}) + set(ARROW_BZIP2_SOURCE_URL "$ENV{ARROW_BZIP2_URL}") +else() + set_urls( + ARROW_BZIP2_SOURCE_URL + "https://sourceware.org/pub/bzip2/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" + ) +endif() + if(DEFINED ENV{ARROW_CARES_URL}) set(CARES_SOURCE_URL "$ENV{ARROW_CARES_URL}") else() @@ -571,6 +581,15 @@ else() ) endif() +if(DEFINED ENV{ARROW_UTF8PROC_URL}) + set(ARROW_UTF8PROC_SOURCE_URL "$ENV{ARROW_UTF8PROC_URL}") +else() + set_urls( + ARROW_UTF8PROC_SOURCE_URL + "https://github.com/JuliaStrings/utf8proc/archive/${ARROW_UTF8PROC_BUILD_VERSION}.tar.gz" + ) +endif() + if(DEFINED ENV{ARROW_XSIMD_URL}) set(XSIMD_SOURCE_URL "$ENV{ARROW_XSIMD_URL}") else() @@ -598,30 +617,15 @@ else() ) endif() -if(DEFINED ENV{ARROW_BZIP2_SOURCE_URL}) - 
set(ARROW_BZIP2_SOURCE_URL "$ENV{ARROW_BZIP2_SOURCE_URL}") -else() - set_urls( - ARROW_BZIP2_SOURCE_URL - "https://sourceware.org/pub/bzip2/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" - ) -endif() - -if(DEFINED ENV{ARROW_UTF8PROC_SOURCE_URL}) - set(ARROW_UTF8PROC_SOURCE_URL "$ENV{ARROW_UTF8PROC_SOURCE_URL}") -else() - set_urls( - ARROW_UTF8PROC_SOURCE_URL - "https://github.com/JuliaStrings/utf8proc/archive/${ARROW_UTF8PROC_BUILD_VERSION}.tar.gz" - ) -endif() - # ---------------------------------------------------------------------- # ExternalProject options -set(EP_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}}") -set(EP_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}}") +set( + EP_CXX_FLAGS + "${CMAKE_CXX_COMPILER_ARG1} ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}}" + ) +set(EP_C_FLAGS + "${CMAKE_C_COMPILER_ARG1} ${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}}") if(NOT MSVC_TOOLCHAIN) # Set -fPIC on all external projects @@ -1935,7 +1939,7 @@ macro(build_xsimd) endmacro() # For now xsimd is always bundled from upstream -if(1) +if(NOT ARROW_SIMD_LEVEL STREQUAL "NONE") set(xsimd_SOURCE "BUNDLED") resolve_dependency(xsimd) # TODO: Don't use global includes but rather target_include_directories diff --git a/cpp/cmake_modules/Usevcpkg.cmake b/cpp/cmake_modules/Usevcpkg.cmake index 118d850909f..781bec436f3 100644 --- a/cpp/cmake_modules/Usevcpkg.cmake +++ b/cpp/cmake_modules/Usevcpkg.cmake @@ -57,7 +57,7 @@ else() endif() elseif(DEFINED ENV{VCPKG_ROOT}) # Get it from the environment variable VCPKG_ROOT - set(VCPKG_ROOT ENV{VCPKG_ROOT}) + set(VCPKG_ROOT $ENV{VCPKG_ROOT}) find_program(_VCPKG_BIN vcpkg PATHS "${VCPKG_ROOT}" NO_DEFAULT_PATH) if(NOT _VCPKG_BIN) message( diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 04756aaf8e9..df72dcc5b6b 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -373,6 +373,7 @@ if(ARROW_COMPUTE) compute/kernels/aggregate_tdigest.cc compute/kernels/aggregate_var_std.cc compute/kernels/codegen_internal.cc + compute/kernels/hash_aggregate.cc compute/kernels/scalar_arithmetic.cc compute/kernels/scalar_boolean.cc compute/kernels/scalar_cast_boolean.cc diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index d3ae93318ba..db3c640b9a4 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -117,13 +117,13 @@ class BaseBinaryArray : public FlatArray { } } - IteratorType begin() { return IteratorType(*this); } + IteratorType begin() const { return IteratorType(*this); } - IteratorType end() { return IteratorType(*this, length()); } + IteratorType end() const { return IteratorType(*this, length()); } protected: // For subclasses - BaseBinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {} + BaseBinaryArray() = default; // Protected method for constructors void SetData(const std::shared_ptr& data) { @@ -132,8 +132,8 @@ class BaseBinaryArray : public FlatArray { raw_data_ = data->GetValuesSafe(2, /*offset=*/0); } - const offset_type* raw_value_offsets_; - const uint8_t* raw_data_; + const offset_type* raw_value_offsets_ = NULLPTR; + const uint8_t* raw_data_ = NULLPTR; }; /// Concrete Array class for variable-size binary data @@ -231,9 +231,9 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { const uint8_t* raw_values() const { return raw_values_ + 
data_->offset * byte_width_; } - IteratorType begin() { return IteratorType(*this); } + IteratorType begin() const { return IteratorType(*this); } - IteratorType end() { return IteratorType(*this, length()); } + IteratorType end() const { return IteratorType(*this, length()); } protected: void SetData(const std::shared_ptr& data) { diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index 1696653850b..a50cbcc13cf 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -20,6 +20,7 @@ #include #include +#include #include #include "arrow/array.h" @@ -197,10 +198,11 @@ class TestListArray : public TestBuilder { } void TestFromArrays() { - std::shared_ptr offsets1, offsets2, offsets3, offsets4, values; + std::shared_ptr offsets1, offsets2, offsets3, offsets4, offsets5, values; std::vector offsets_is_valid3 = {true, false, true, true}; std::vector offsets_is_valid4 = {true, true, false, true}; + std::vector offsets_is_valid5 = {true, true, false, false}; std::vector values_is_valid = {true, false, true, true, true, true}; @@ -217,6 +219,8 @@ class TestListArray : public TestBuilder { &offsets3); ArrayFromVector(offsets_is_valid4, offset2_values, &offsets4); + ArrayFromVector(offsets_is_valid5, offset2_values, + &offsets5); ArrayFromVector(values_is_valid, values_values, &values); @@ -254,6 +258,28 @@ class TestListArray : public TestBuilder { // Offsets not the right type ASSERT_RAISES(TypeError, ArrayType::FromArrays(*values, *offsets1, pool_)); + + // Null final offset + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("Last list offset should be non-null"), + ArrayType::FromArrays(*offsets5, *values, pool_)); + + // ARROW-12077: check for off-by-one in construction (need mimalloc/ASan/Valgrind) + { + std::shared_ptr offsets, values; + // Length multiple of 8 - we'll allocate a validity buffer with exactly enough bits + // (Need a large enough buffer or else ASan doesn't catch it) + std::vector offsets_is_valid(4096); + std::vector offset_values(4096); + std::vector values_values(4096); + std::fill(offsets_is_valid.begin(), offsets_is_valid.end(), true); + offsets_is_valid[1] = false; + std::fill(offset_values.begin(), offset_values.end(), 0); + std::fill(values_values.begin(), values_values.end(), 0); + ArrayFromVector(offsets_is_valid, offset_values, &offsets); + ArrayFromVector(values_values, &values); + ASSERT_OK_AND_ASSIGN(auto list, ArrayType::FromArrays(*offsets, *values, pool_)); + } } void TestAppendNull() { diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 97bbb18696c..f967127c5f1 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -70,12 +70,11 @@ Status CleanListOffsets(const Array& offsets, MemoryPool* pool, ARROW_ASSIGN_OR_RAISE(auto clean_offsets, AllocateBuffer(num_offsets * sizeof(offset_type), pool)); - // Copy valid bits, zero out the bit for the final offset - // XXX why? 
+ // Copy valid bits, ignoring the final offset (since for a length N list array, + // we have N + 1 offsets) ARROW_ASSIGN_OR_RAISE( auto clean_valid_bits, offsets.null_bitmap()->CopySlice(0, BitUtil::BytesForBits(num_offsets - 1))); - BitUtil::ClearBit(clean_valid_bits->mutable_data(), num_offsets); *validity_buf_out = clean_valid_bits; const offset_type* raw_offsets = typed_offsets.raw_values(); diff --git a/cpp/src/arrow/array/array_primitive.h b/cpp/src/arrow/array/array_primitive.h index f9ac60f6cb9..b601eb770c3 100644 --- a/cpp/src/arrow/array/array_primitive.h +++ b/cpp/src/arrow/array/array_primitive.h @@ -64,9 +64,9 @@ class NumericArray : public PrimitiveArray { // For API compatibility with BinaryArray etc. value_type GetView(int64_t i) const { return Value(i); } - IteratorType begin() { return IteratorType(*this); } + IteratorType begin() const { return IteratorType(*this); } - IteratorType end() { return IteratorType(*this, length()); } + IteratorType end() const { return IteratorType(*this, length()); } protected: using PrimitiveArray::PrimitiveArray; @@ -99,9 +99,9 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { /// values. Result is not cached. int64_t true_count() const; - IteratorType begin() { return IteratorType(*this); } + IteratorType begin() const { return IteratorType(*this); } - IteratorType end() { return IteratorType(*this, length()); } + IteratorType end() const { return IteratorType(*this, length()); } protected: using PrimitiveArray::PrimitiveArray; diff --git a/cpp/src/arrow/array/array_union_test.cc b/cpp/src/arrow/array/array_union_test.cc index 1eb722b13c5..88d25e823bb 100644 --- a/cpp/src/arrow/array/array_union_test.cc +++ b/cpp/src/arrow/array/array_union_test.cc @@ -152,7 +152,8 @@ class TestUnionArrayFactories : public ::testing::Test { TEST_F(TestUnionArrayFactories, TestMakeDense) { std::shared_ptr value_offsets; - ArrayFromVector({1, 0, 0, 0, 1, 0, 1, 2, 1, 2}, &value_offsets); + // type_ids_: {0, 1, 2, 0, 1, 3, 2, 0, 2, 1} + ArrayFromVector({0, 0, 0, 1, 1, 0, 1, 2, 1, 2}, &value_offsets); auto children = std::vector>(4); ArrayFromVector({"abc", "def", "xyz"}, &children[0]); @@ -208,12 +209,19 @@ TEST_F(TestUnionArrayFactories, TestMakeDense) { ASSERT_RAISES(Invalid, result->ValidateFull()); // Invalid offsets + // - offset out of bounds at index 5 std::shared_ptr invalid_offsets; - ArrayFromVector({1, 0, 0, 0, 1, 1, 1, 2, 1, 2}, &invalid_offsets); + ArrayFromVector({0, 0, 0, 1, 1, 1, 1, 2, 1, 2}, &invalid_offsets); ASSERT_OK_AND_ASSIGN(result, DenseUnionArray::Make(*type_ids_, *invalid_offsets, children)); ASSERT_RAISES(Invalid, result->ValidateFull()); - ArrayFromVector({1, 0, 0, 0, 1, -1, 1, 2, 1, 2}, &invalid_offsets); + // - negative offset at index 5 + ArrayFromVector({0, 0, 0, 1, 1, -1, 1, 2, 1, 2}, &invalid_offsets); + ASSERT_OK_AND_ASSIGN(result, + DenseUnionArray::Make(*type_ids_, *invalid_offsets, children)); + ASSERT_RAISES(Invalid, result->ValidateFull()); + // - non-monotonic offset at index 3 + ArrayFromVector({1, 0, 0, 0, 1, 0, 1, 2, 1, 2}, &invalid_offsets); ASSERT_OK_AND_ASSIGN(result, DenseUnionArray::Make(*type_ids_, *invalid_offsets, children)); ASSERT_RAISES(Invalid, result->ValidateFull()); diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 2b8665fe2e6..6ac885f8443 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -527,6 +527,7 @@ struct ValidateArrayFullImpl { } // Check offsets are in bounds + std::vector last_child_offsets(256, 0); const 
int32_t* offsets = data.GetValues(2); for (int64_t i = 0; i < data.length; ++i) { const int32_t code = type_codes[i]; @@ -541,6 +542,11 @@ struct ValidateArrayFullImpl { "than child length (", offset, " >= ", child_lengths[code], ")"); } + if (offset < last_child_offsets[code]) { + return Status::Invalid("Union value at position ", i, + " has non-monotonic offset ", offset); + } + last_child_offsets[code] = offset; } } diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index 41a47c91729..f525ec23c58 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -162,6 +162,12 @@ class ARROW_EXPORT BufferBuilder { return Status::OK(); } + Result> Finish(bool shrink_to_fit = true) { + std::shared_ptr out; + ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit)); + return out; + } + void Reset() { buffer_ = NULLPTR; capacity_ = size_ = 0; @@ -202,6 +208,11 @@ class TypedBufferBuilder< MemoryPool* pool = default_memory_pool()) : bytes_builder_(std::move(buffer), pool) {} + explicit TypedBufferBuilder(BufferBuilder builder) + : bytes_builder_(std::move(builder)) {} + + BufferBuilder* bytes_builder() { return &bytes_builder_; } + Status Append(T value) { return bytes_builder_.Append(reinterpret_cast(&value), sizeof(T)); } @@ -256,6 +267,12 @@ class TypedBufferBuilder< return bytes_builder_.Finish(out, shrink_to_fit); } + Result> Finish(bool shrink_to_fit = true) { + std::shared_ptr out; + ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit)); + return out; + } + void Reset() { bytes_builder_.Reset(); } int64_t length() const { return bytes_builder_.length() / sizeof(T); } @@ -274,6 +291,11 @@ class TypedBufferBuilder { explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool()) : bytes_builder_(pool) {} + explicit TypedBufferBuilder(BufferBuilder builder) + : bytes_builder_(std::move(builder)) {} + + BufferBuilder* bytes_builder() { return &bytes_builder_; } + Status Append(bool value) { ARROW_RETURN_NOT_OK(Reserve(1)); UnsafeAppend(value); @@ -371,6 +393,12 @@ class TypedBufferBuilder { return bytes_builder_.Finish(out, shrink_to_fit); } + Result> Finish(bool shrink_to_fit = true) { + std::shared_ptr out; + ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit)); + return out; + } + void Reset() { bytes_builder_.Reset(); bit_length_ = false_count_ = 0; diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index 387105de9e7..6769b23867b 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -71,7 +71,7 @@ class EqualOptions { return res; } - static EqualOptions Defaults() { return EqualOptions(); } + static EqualOptions Defaults() { return {}; } protected: double atol_ = kDefaultAbsoluteTolerance; diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index eef1587bb73..ca118ec5678 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -306,5 +306,102 @@ Result TDigest(const Datum& value, const TDigestOptions& options = TDigestOptions::Defaults(), ExecContext* ctx = NULLPTR); +namespace internal { + +/// Internal use only: streaming group identifier. +/// Consumes batches of keys and yields batches of the group ids. +class ARROW_EXPORT Grouper { + public: + virtual ~Grouper() = default; + + /// Construct a Grouper which receives the specified key types + static Result> Make(const std::vector& descrs, + ExecContext* ctx = default_exec_context()); + + /// Consume a batch of keys, producing the corresponding group ids as an integer array. 
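+ /// For example (an editorial sketch; which id is assigned to which key is an
+ /// implementation detail, only its consistency across calls matters):
+ ///   Consume({["a", "b", "a", "c"]}) -> [0, 1, 0, 2]
+ ///   Consume({["c", "b"]})           -> [2, 1]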
+ /// Currently only uint32 indices will be produced, eventually the bit width will only + /// be as wide as necessary. + virtual Result Consume(const ExecBatch& batch) = 0; + + /// Get current unique keys. May be called multiple times. + virtual Result GetUniques() = 0; + + /// Get the current number of groups. + virtual uint32_t num_groups() const = 0; + + /// \brief Assemble lists of indices of identical elements. + /// + /// \param[in] ids An unsigned, all-valid integral array which will be + /// used as grouping criteria. + /// \param[in] num_groups An upper bound for the elements of ids + /// \return A num_groups-long ListArray where the slot at i contains a + /// list of indices where i appears in ids. + /// + /// MakeGroupings([ + /// 2, + /// 2, + /// 5, + /// 5, + /// 2, + /// 3 + /// ], 8) == [ + /// [], + /// [], + /// [0, 1, 4], + /// [5], + /// [], + /// [2, 3], + /// [], + /// [] + /// ] + static Result> MakeGroupings( + const UInt32Array& ids, uint32_t num_groups, + ExecContext* ctx = default_exec_context()); + + /// \brief Produce a ListArray whose slots are selections of `array` which correspond to + /// the provided groupings. + /// + /// For example, + /// ApplyGroupings([ + /// [], + /// [], + /// [0, 1, 4], + /// [5], + /// [], + /// [2, 3], + /// [], + /// [] + /// ], [2, 2, 5, 5, 2, 3]) == [ + /// [], + /// [], + /// [2, 2, 2], + /// [3], + /// [], + /// [5, 5], + /// [], + /// [] + /// ] + static Result> ApplyGroupings( + const ListArray& groupings, const Array& array, + ExecContext* ctx = default_exec_context()); +}; + +/// \brief Configure a grouped aggregation +struct ARROW_EXPORT Aggregate { + /// the name of the aggregation function + std::string function; + + /// options for the aggregation function + const FunctionOptions* options; +}; + +/// Internal use only: helper function for testing HashAggregateKernels. +/// This will be replaced by streaming execution operators. 
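+/// A hypothetical call, sketched for orientation (`sales` and `region` stand
+/// for any equal-length arrays; a null options pointer selects the function's
+/// default options):
+///
+///   ARROW_ASSIGN_OR_RAISE(
+///       Datum grouped,
+///       GroupBy(/*arguments=*/{sales}, /*keys=*/{region},
+///               /*aggregates=*/{{"hash_sum", nullptr}}));
+///
+/// producing one aggregated value per distinct key.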
+ARROW_EXPORT +Result GroupBy(const std::vector& arguments, const std::vector& keys, + const std::vector& aggregates, + ExecContext* ctx = default_exec_context()); + +} // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 0d95092c95b..730836bd118 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -68,6 +68,19 @@ struct ARROW_EXPORT SplitPatternOptions : public SplitOptions { std::string pattern; }; +struct ARROW_EXPORT ReplaceSubstringOptions : public FunctionOptions { + explicit ReplaceSubstringOptions(std::string pattern, std::string replacement, + int64_t max_replacements = -1) + : pattern(pattern), replacement(replacement), max_replacements(max_replacements) {} + + /// Pattern to match, literal, or regular expression depending on which kernel is used + std::string pattern; + /// String to replace the pattern with + std::string replacement; + /// Max number of substrings to replace (-1 means unbounded) + int64_t max_replacements; +}; + /// Options for IsIn and IndexIn functions struct ARROW_EXPORT SetLookupOptions : public FunctionOptions { explicit SetLookupOptions(Datum value_set, bool skip_nulls = false) diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index 6443c96e918..c3187a3995a 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -36,6 +36,7 @@ #include "arrow/compute/registry.h" #include "arrow/compute/util_internal.h" #include "arrow/datum.h" +#include "arrow/record_batch.h" #include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/type.h" @@ -57,6 +58,44 @@ using internal::CpuInfo; namespace compute { +ExecContext* default_exec_context() { + static ExecContext default_ctx; + return &default_ctx; +} + +ExecBatch::ExecBatch(const RecordBatch& batch) + : values(batch.num_columns()), length(batch.num_rows()) { + auto columns = batch.column_data(); + std::move(columns.begin(), columns.end(), values.begin()); +} + +Result ExecBatch::Make(std::vector values) { + if (values.empty()) { + return Status::Invalid("Cannot infer ExecBatch length without at least one value"); + } + + int64_t length = -1; + for (const auto& value : values) { + if (value.is_scalar()) { + if (length == -1) { + length = 1; + } + continue; + } + + if (length == -1) { + length = value.length(); + continue; + } + + if (length != value.length()) { + return Status::Invalid( + "Arrays used to construct an ExecBatch must have equal length"); + } + } + + return ExecBatch(std::move(values), length); +} namespace { Result> AllocateDataBuffer(KernelContext* ctx, int64_t length, @@ -838,6 +877,7 @@ class ScalarAggExecutor : public KernelExecutorImpl { private: Status Consume(const ExecBatch& batch) { + // FIXME(ARROW-11840) don't merge *any* aggregates for every batch auto batch_state = kernel_->init(kernel_ctx_, {kernel_, *input_descrs_, options_}); ARROW_CTX_RETURN_IF_ERROR(kernel_ctx_); @@ -855,6 +895,7 @@ class ScalarAggExecutor : public KernelExecutorImpl { kernel_->merge(kernel_ctx_, std::move(*batch_state), state()); ARROW_CTX_RETURN_IF_ERROR(kernel_ctx_); + return Status::OK(); } diff --git a/cpp/src/arrow/compute/exec.h b/cpp/src/arrow/compute/exec.h index f491489ed8a..7659442d8bf 100644 --- a/cpp/src/arrow/compute/exec.h +++ b/cpp/src/arrow/compute/exec.h @@ -119,6 +119,8 @@ class ARROW_EXPORT ExecContext { bool use_threads_ = true; }; +ARROW_EXPORT ExecContext* default_exec_context(); + // TODO: Consider 
standardizing on uint16 selection vectors and only use them // when we can ensure that each value is 64K length or smaller @@ -164,11 +166,15 @@ class ARROW_EXPORT SelectionVector { /// TODO: Datum uses arrow/util/variant.h which may be a bit heavier-weight /// than is desirable for this class. Microbenchmarks would help determine for /// sure. See ARROW-8928. -struct ExecBatch { +struct ARROW_EXPORT ExecBatch { ExecBatch() = default; ExecBatch(std::vector values, int64_t length) : values(std::move(values)), length(length) {} + explicit ExecBatch(const RecordBatch& batch); + + static Result Make(std::vector values); + /// The values representing positional arguments to be passed to a kernel's /// exec function for processing. std::vector values; diff --git a/cpp/src/arrow/compute/exec_internal.h b/cpp/src/arrow/compute/exec_internal.h index a74e5c8d8fa..55daa243cd3 100644 --- a/cpp/src/arrow/compute/exec_internal.h +++ b/cpp/src/arrow/compute/exec_internal.h @@ -106,6 +106,11 @@ class ARROW_EXPORT KernelExecutor { public: virtual ~KernelExecutor() = default; + /// The Kernel's `init` method must be called and any KernelState set in the + /// KernelContext *before* KernelExecutor::Init is called. This is to facilitate + /// the case where init may be expensive and does not need to be called again for + /// each execution of the kernel, for example the same lookup table can be re-used + /// for all scanned batches in a dataset filter. virtual Status Init(KernelContext*, KernelInitArgs) = 0; /// XXX: Better configurability for listener diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc index 70d7d998e9c..c8fc8b8dec0 100644 --- a/cpp/src/arrow/compute/function.cc +++ b/cpp/src/arrow/compute/function.cc @@ -126,6 +126,11 @@ const Kernel* DispatchExactImpl(const Function* func, checked_cast(func)->kernels(), values); } + if (func->kind() == Function::HASH_AGGREGATE) { + return DispatchExactImpl(checked_cast(func)->kernels(), + values); + } + return nullptr; } @@ -184,8 +189,10 @@ Result Function::Execute(const std::vector& args, executor = detail::KernelExecutor::MakeScalar(); } else if (kind() == Function::VECTOR) { executor = detail::KernelExecutor::MakeVector(); - } else { + } else if (kind() == Function::SCALAR_AGGREGATE) { executor = detail::KernelExecutor::MakeScalarAggregate(); + } else { + return Status::NotImplemented("Direct execution of HASH_AGGREGATE functions"); } RETURN_NOT_OK(executor->Init(&kernel_ctx, {kernel, inputs, options})); @@ -263,6 +270,15 @@ Status ScalarAggregateFunction::AddKernel(ScalarAggregateKernel kernel) { return Status::OK(); } +Status HashAggregateFunction::AddKernel(HashAggregateKernel kernel) { + RETURN_NOT_OK(CheckArity(kernel.signature->in_types())); + if (arity_.is_varargs && !kernel.signature->is_varargs()) { + return Status::Invalid("Function accepts varargs but kernel signature does not"); + } + kernels_.emplace_back(std::move(kernel)); + return Status::OK(); +} + Result MetaFunction::Execute(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const { diff --git a/cpp/src/arrow/compute/function.h b/cpp/src/arrow/compute/function.h index af5d81a30ec..9a3e1c1852f 100644 --- a/cpp/src/arrow/compute/function.h +++ b/cpp/src/arrow/compute/function.h @@ -133,6 +133,10 @@ class ARROW_EXPORT Function { /// A function that computes scalar summary statistics from array input. SCALAR_AGGREGATE, + /// A function that computes grouped summary statistics from array input + /// and an array of group identifiers. 
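+ /// For example, a grouped sum over arguments [1.0, 2.0, 3.0] with group
+ /// identifiers [0, 1, 0] would yield [4.0, 2.0]: one slot per group.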
+ HASH_AGGREGATE, + /// A function that dispatches to other functions and does not contain its /// own kernels. META @@ -307,6 +311,21 @@ class ARROW_EXPORT ScalarAggregateFunction Status AddKernel(ScalarAggregateKernel kernel); }; +class ARROW_EXPORT HashAggregateFunction + : public detail::FunctionImpl { + public: + using KernelType = HashAggregateKernel; + + HashAggregateFunction(std::string name, const Arity& arity, const FunctionDoc* doc, + const FunctionOptions* default_options = NULLPTR) + : detail::FunctionImpl( + std::move(name), Function::HASH_AGGREGATE, arity, doc, default_options) {} + + /// \brief Add a kernel (function implementation). Returns error if the + /// kernel's signature does not match the function's arity. + Status AddKernel(HashAggregateKernel kernel); +}; + /// \brief A function that dispatches to other functions. Must implement /// MetaFunction::ExecuteImpl. /// diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index c8f9cacfb34..b99b41170d2 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -537,7 +537,8 @@ struct Kernel { : signature(std::move(sig)), init(std::move(init)) {} Kernel(std::vector in_types, OutputType out_type, KernelInit init) - : Kernel(KernelSignature::Make(std::move(in_types), out_type), std::move(init)) {} + : Kernel(KernelSignature::Make(std::move(in_types), std::move(out_type)), + std::move(init)) {} /// \brief The "signature" of the kernel containing the InputType input /// argument validators and OutputType output type and shape resolver. @@ -574,7 +575,8 @@ struct ArrayKernel : public Kernel { ArrayKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, KernelInit init = NULLPTR) - : Kernel(std::move(in_types), std::move(out_type), init), exec(std::move(exec)) {} + : Kernel(std::move(in_types), std::move(out_type), std::move(init)), + exec(std::move(exec)) {} /// \brief Perform a single invocation of this kernel. Depending on the /// implementation, it may only write into preallocated memory, while in some @@ -617,7 +619,7 @@ struct VectorKernel : public ArrayKernel { VectorKernel() = default; VectorKernel(std::shared_ptr sig, ArrayKernelExec exec) - : ArrayKernel(std::move(sig), exec) {} + : ArrayKernel(std::move(sig), std::move(exec)) {} VectorKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, KernelInit init = NULLPTR, VectorFinalize finalize = NULLPTR) @@ -680,12 +682,12 @@ using ScalarAggregateFinalize = std::function; /// * finalize: produces the end result of the aggregation using the /// KernelState in the KernelContext. 
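/// Schematically, an executor drives these components in sequence (an
/// editorial sketch, not literal code):
///
///   state = init(ctx, args);                  // once per execution
///   for (batch : input) consume(ctx, batch);  // accumulate into state
///   merge(ctx, std::move(other), state);      // combine parallel states
///   finalize(ctx, &out);                      // emit the result Datum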
struct ScalarAggregateKernel : public Kernel { - ScalarAggregateKernel() {} + ScalarAggregateKernel() = default; ScalarAggregateKernel(std::shared_ptr sig, KernelInit init, ScalarAggregateConsume consume, ScalarAggregateMerge merge, ScalarAggregateFinalize finalize) - : Kernel(std::move(sig), init), + : Kernel(std::move(sig), std::move(init)), consume(std::move(consume)), merge(std::move(merge)), finalize(std::move(finalize)) {} @@ -693,13 +695,59 @@ struct ScalarAggregateKernel : public Kernel { ScalarAggregateKernel(std::vector in_types, OutputType out_type, KernelInit init, ScalarAggregateConsume consume, ScalarAggregateMerge merge, ScalarAggregateFinalize finalize) - : ScalarAggregateKernel(KernelSignature::Make(std::move(in_types), out_type), init, - consume, merge, finalize) {} + : ScalarAggregateKernel( + KernelSignature::Make(std::move(in_types), std::move(out_type)), + std::move(init), std::move(consume), std::move(merge), std::move(finalize)) {} ScalarAggregateConsume consume; ScalarAggregateMerge merge; ScalarAggregateFinalize finalize; }; +// ---------------------------------------------------------------------- +// HashAggregateKernel (for HashAggregateFunction) + +using HashAggregateConsume = std::function; + +using HashAggregateMerge = + std::function; + +// Finalize returns Datum to permit multiple return values +using HashAggregateFinalize = std::function; + +/// \brief Kernel data structure for implementations of +/// HashAggregateFunction. The four necessary components of an aggregation +/// kernel are the init, consume, merge, and finalize functions. +/// +/// * init: creates a new KernelState for a kernel. +/// * consume: processes an ExecBatch (which includes the argument as well +/// as an array of group identifiers) and updates the KernelState found in the +/// KernelContext. +/// * merge: combines one KernelState with another. +/// * finalize: produces the end result of the aggregation using the +/// KernelState in the KernelContext. 
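+///
+/// As constructed in hash_aggregate.cc, each consumed ExecBatch is expected to
+/// carry three values: the argument to aggregate, a uint32 array of group ids,
+/// and a uint32 scalar holding the current group count. Illustratively:
+///   batch[0] = [10, 20, 30]   // values
+///   batch[1] = [0, 1, 0]      // group id of each value
+///   batch[2] = 2              // number of groups allocated so far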
+struct HashAggregateKernel : public Kernel { + HashAggregateKernel() = default; + + HashAggregateKernel(std::shared_ptr sig, KernelInit init, + HashAggregateConsume consume, HashAggregateMerge merge, + HashAggregateFinalize finalize) + : Kernel(std::move(sig), std::move(init)), + consume(std::move(consume)), + merge(std::move(merge)), + finalize(std::move(finalize)) {} + + HashAggregateKernel(std::vector in_types, OutputType out_type, + KernelInit init, HashAggregateMerge merge, + HashAggregateConsume consume, HashAggregateFinalize finalize) + : HashAggregateKernel( + KernelSignature::Make(std::move(in_types), std::move(out_type)), + std::move(init), std::move(consume), std::move(merge), std::move(finalize)) {} + + HashAggregateConsume consume; + HashAggregateMerge merge; + HashAggregateFinalize finalize; +}; + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 577b250da87..5e223a1f906 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -59,5 +59,9 @@ add_arrow_benchmark(vector_selection_benchmark PREFIX "arrow-compute") # Aggregates -add_arrow_compute_test(aggregate_test SOURCES aggregate_test.cc test_util.cc) +add_arrow_compute_test(aggregate_test + SOURCES + aggregate_test.cc + hash_aggregate_test.cc + test_util.cc) add_arrow_benchmark(aggregate_benchmark PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 5cdd3bd1dd1..61dc8cb403c 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -250,15 +250,13 @@ const FunctionDoc min_max_doc{"Compute the minimum and maximum values of a numer {"array"}, "MinMaxOptions"}; -const FunctionDoc any_doc{ - "Test whether any element in a boolean array evaluates to true.", - ("Null values are ignored."), - {"array"}}; - -const FunctionDoc all_doc{ - "Test whether all elements in a boolean array evaluate to true.", - ("Null values are ignored."), - {"array"}}; +const FunctionDoc any_doc{"Test whether any element in a boolean array evaluates to true", + ("Null values are ignored."), + {"array"}}; + +const FunctionDoc all_doc{"Test whether all elements in a boolean array evaluate to true", + ("Null values are ignored."), + {"array"}}; } // namespace diff --git a/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc b/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc index c90dd03c06e..42be0c36544 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc @@ -300,6 +300,169 @@ BENCHMARK_TEMPLATE(ReferenceSum, SumBitmapVectorizeUnroll) ->Apply(BenchmarkSetArgs); #endif // ARROW_WITH_BENCHMARKS_REFERENCE +// +// GroupBy +// + +static void BenchmarkGroupBy(benchmark::State& state, + std::vector aggregates, + std::vector arguments, std::vector keys) { + for (auto _ : state) { + ABORT_NOT_OK(GroupBy(arguments, keys, aggregates).status()); + } +} + +#define GROUP_BY_BENCHMARK(Name, Impl) \ + static void Name(benchmark::State& state) { \ + RegressionArgs args(state, false); \ + auto rng = random::RandomArrayGenerator(1923); \ + (Impl)(); \ + } \ + BENCHMARK(Name)->Apply([](benchmark::internal::Benchmark* bench) { \ + BenchmarkSetArgsWithSizes(bench, {1 * 1024 * 1024}); \ + }) + +GROUP_BY_BENCHMARK(SumDoublesGroupedByTinyStringSet, [&] { + auto summand = rng.Float64(args.size, + 
/*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto key = rng.StringWithRepeats(args.size, + /*unique=*/16, + /*min_length=*/3, + /*max_length=*/32); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedBySmallStringSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto key = rng.StringWithRepeats(args.size, + /*unique=*/256, + /*min_length=*/3, + /*max_length=*/32); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedByMediumStringSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto key = rng.StringWithRepeats(args.size, + /*unique=*/4096, + /*min_length=*/3, + /*max_length=*/32); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedByTinyIntegerSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto key = rng.Int64(args.size, + /*min=*/0, + /*max=*/15); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedBySmallIntegerSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto key = rng.Int64(args.size, + /*min=*/0, + /*max=*/255); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedByMediumIntegerSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto key = rng.Int64(args.size, + /*min=*/0, + /*max=*/4095); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedByTinyIntStringPairSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto int_key = rng.Int64(args.size, + /*min=*/0, + /*max=*/4); + auto str_key = rng.StringWithRepeats(args.size, + /*unique=*/4, + /*min_length=*/3, + /*max_length=*/32); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {int_key, str_key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedBySmallIntStringPairSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto int_key = rng.Int64(args.size, + /*min=*/0, + /*max=*/15); + auto str_key = rng.StringWithRepeats(args.size, + /*unique=*/16, + /*min_length=*/3, + /*max_length=*/32); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {int_key, str_key}); +}); + +GROUP_BY_BENCHMARK(SumDoublesGroupedByMediumIntStringPairSet, [&] { + auto summand = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + + auto int_key = rng.Int64(args.size, + 
/*min=*/0, + /*max=*/63); + auto str_key = rng.StringWithRepeats(args.size, + /*unique=*/64, + /*min_length=*/3, + /*max_length=*/32); + + BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {int_key, str_key}); +}); + // // Sum // diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 569886a1351..22e7f512e97 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -27,19 +27,26 @@ #include "arrow/array.h" #include "arrow/chunked_array.h" #include "arrow/compute/api_aggregate.h" +#include "arrow/compute/api_scalar.h" +#include "arrow/compute/api_vector.h" +#include "arrow/compute/cast.h" #include "arrow/compute/kernels/aggregate_internal.h" #include "arrow/compute/kernels/test_util.h" +#include "arrow/compute/registry.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bitmap_reader.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/int_util_internal.h" #include "arrow/testing/gtest_common.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" +#include "arrow/util/logging.h" namespace arrow { +using internal::BitmapReader; using internal::checked_cast; using internal::checked_pointer_cast; @@ -65,8 +72,7 @@ static SumResult NaiveSumPartial(const Array& array) { const auto values = array_numeric.raw_values(); if (array.null_count() != 0) { - internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), - array.length()); + BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); for (int64_t i = 0; i < array.length(); i++) { if (reader.IsSet()) { result.first += values[i]; @@ -488,9 +494,7 @@ class TestPrimitiveMinMaxKernel : public ::testing::Test { void AssertMinMaxIsNull(const Datum& array, const MinMaxOptions& options) { ASSERT_OK_AND_ASSIGN(Datum out, MinMax(array, options)); - - const StructScalar& value = out.scalar_as(); - for (const auto& val : value.value) { + for (const auto& val : out.scalar_as().value) { ASSERT_FALSE(val->is_valid); } } @@ -646,8 +650,7 @@ static enable_if_integer> NaiveMinMax( T min = std::numeric_limits::max(); T max = std::numeric_limits::min(); if (array.null_count() != 0) { // Some values are null - internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), - array.length()); + BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); for (int64_t i = 0; i < array.length(); i++) { if (reader.IsSet()) { min = std::min(min, values[i]); @@ -686,8 +689,7 @@ static enable_if_floating_point> NaiveMinMax( T min = std::numeric_limits::infinity(); T max = -std::numeric_limits::infinity(); if (array.null_count() != 0) { // Some values are null - internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), - array.length()); + BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); for (int64_t i = 0; i < array.length(); i++) { if (reader.IsSet()) { min = std::fmin(min, values[i]); @@ -1030,7 +1032,7 @@ ModeResult NaiveMode(const Array& array) { const auto& array_numeric = reinterpret_cast(array); const auto values = array_numeric.raw_values(); - internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); + BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); for (int64_t i = 0; i < array.length(); ++i) { if (reader.IsSet()) { ++value_counts[values[i]]; @@ -1281,7 +1283,7 @@ void KahanSum(double& sum, double& adjust, double 
addend) { template std::pair WelfordVar(const ArrayType& array) { const auto values = array.raw_values(); - internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); + BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); double count = 0, mean = 0, m2 = 0; double mean_adjust = 0, m2_adjust = 0; for (int64_t i = 0; i < array.length(); ++i) { diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 9e2ed82a426..b5d6c3807f1 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -328,6 +328,13 @@ struct UnboxScalar { } }; +template <> +struct UnboxScalar { + static Decimal256 Unbox(const Scalar& val) { + return checked_cast(val).value; + } +}; + template struct BoxScalar; @@ -354,6 +361,13 @@ struct BoxScalar { static void Box(T val, Scalar* out) { checked_cast(out)->value = val; } }; +template <> +struct BoxScalar { + using T = Decimal256; + using ScalarType = Decimal256Scalar; + static void Box(T val, Scalar* out) { checked_cast(out)->value = val; } +}; + // A VisitArrayDataInline variant that calls its visitor function with logical // values, such as Decimal128 rather than util::string_view. @@ -675,12 +689,13 @@ struct ScalarUnaryNotNullStateful { }; template - struct ArrayExec::value>> { + struct ArrayExec> { static void Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0, Datum* out) { ArrayData* out_arr = out->mutable_array(); // Decimal128 data buffers are not safely reinterpret_cast-able on big-endian - using endian_agnostic = std::array; + using endian_agnostic = + std::array::ScalarType::ValueType)>; auto out_data = out_arr->GetMutableValues(1); VisitArrayValuesInline( arg0, diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc new file mode 100644 index 00000000000..d9750cb4760 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -0,0 +1,1057 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
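+
+// Editorial overview of this file: each batch's key columns are encoded
+// row-wise into contiguous byte strings (one KeyEncoder per column, with a
+// leading nullity byte per value); GrouperImpl assigns every distinct encoded
+// row a dense uint32 group id via a hash map; the grouped aggregators below
+// then keep one slot of state per group id and emit one output element per
+// group.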
+ +#include "arrow/compute/api_aggregate.h" + +#include +#include +#include +#include +#include + +#include "arrow/buffer_builder.h" +#include "arrow/compute/api_vector.h" +#include "arrow/compute/exec_internal.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/aggregate_internal.h" +#include "arrow/compute/kernels/common.h" +#include "arrow/util/bit_run_reader.h" +#include "arrow/util/bitmap_ops.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/make_unique.h" +#include "arrow/visitor_inline.h" + +namespace arrow { + +using internal::checked_cast; + +namespace compute { +namespace internal { +namespace { + +struct KeyEncoder { + // the first byte of an encoded key is used to indicate nullity + static constexpr bool kExtraByteForNull = true; + + static constexpr uint8_t kNullByte = 1; + static constexpr uint8_t kValidByte = 0; + + virtual ~KeyEncoder() = default; + + virtual void AddLength(const ArrayData&, int32_t* lengths) = 0; + + virtual Status Encode(const ArrayData&, uint8_t** encoded_bytes) = 0; + + virtual Result> Decode(uint8_t** encoded_bytes, + int32_t length, MemoryPool*) = 0; + + // extract the null bitmap from the leading nullity bytes of encoded keys + static Status DecodeNulls(MemoryPool* pool, int32_t length, uint8_t** encoded_bytes, + std::shared_ptr* null_bitmap, int32_t* null_count) { + // first count nulls to determine if a null bitmap is necessary + *null_count = 0; + for (int32_t i = 0; i < length; ++i) { + *null_count += (encoded_bytes[i][0] == kNullByte); + } + + if (*null_count > 0) { + ARROW_ASSIGN_OR_RAISE(*null_bitmap, AllocateBitmap(length, pool)); + + uint8_t* validity = (*null_bitmap)->mutable_data(); + for (int32_t i = 0; i < length; ++i) { + BitUtil::SetBitTo(validity, i, encoded_bytes[i][0] == kValidByte); + encoded_bytes[i] += 1; + } + } else { + for (int32_t i = 0; i < length; ++i) { + encoded_bytes[i] += 1; + } + } + return Status ::OK(); + } +}; + +struct BooleanKeyEncoder : KeyEncoder { + static constexpr int kByteWidth = 1; + + void AddLength(const ArrayData& data, int32_t* lengths) override { + for (int64_t i = 0; i < data.length; ++i) { + lengths[i] += kByteWidth + kExtraByteForNull; + } + } + + Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override { + VisitArrayDataInline( + data, + [&](bool value) { + auto& encoded_ptr = *encoded_bytes++; + *encoded_ptr++ = kValidByte; + *encoded_ptr++ = value; + }, + [&] { + auto& encoded_ptr = *encoded_bytes++; + *encoded_ptr++ = kNullByte; + *encoded_ptr++ = 0; + }); + return Status::OK(); + } + + Result> Decode(uint8_t** encoded_bytes, int32_t length, + MemoryPool* pool) override { + std::shared_ptr null_buf; + int32_t null_count; + RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count)); + + ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBitmap(length, pool)); + + uint8_t* raw_output = key_buf->mutable_data(); + for (int32_t i = 0; i < length; ++i) { + auto& encoded_ptr = encoded_bytes[i]; + BitUtil::SetBitTo(raw_output, i, encoded_ptr[0] != 0); + encoded_ptr += 1; + } + + return ArrayData::Make(boolean(), length, {std::move(null_buf), std::move(key_buf)}, + null_count); + } +}; + +struct FixedWidthKeyEncoder : KeyEncoder { + explicit FixedWidthKeyEncoder(std::shared_ptr type) + : type_(std::move(type)), + byte_width_(checked_cast(*type_).bit_width() / 8) {} + + void AddLength(const ArrayData& data, int32_t* lengths) override { + for (int64_t i = 0; i < data.length; ++i) { + lengths[i] += byte_width_ + kExtraByteForNull; + } + } + + 
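+ // Per-row encoded layout is [nullity byte][byte_width_ value bytes]; e.g. an
+ // int32 value 7 would encode (illustratively, on a little-endian machine) as
+ // [kValidByte, 0x07, 0x00, 0x00, 0x00] and a null as [kNullByte, 0, 0, 0, 0].
+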
Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override { + ArrayData viewed(fixed_size_binary(byte_width_), data.length, data.buffers, + data.null_count, data.offset); + + VisitArrayDataInline( + viewed, + [&](util::string_view bytes) { + auto& encoded_ptr = *encoded_bytes++; + *encoded_ptr++ = kValidByte; + memcpy(encoded_ptr, bytes.data(), byte_width_); + encoded_ptr += byte_width_; + }, + [&] { + auto& encoded_ptr = *encoded_bytes++; + *encoded_ptr++ = kNullByte; + memset(encoded_ptr, 0, byte_width_); + encoded_ptr += byte_width_; + }); + return Status::OK(); + } + + Result> Decode(uint8_t** encoded_bytes, int32_t length, + MemoryPool* pool) override { + std::shared_ptr null_buf; + int32_t null_count; + RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count)); + + ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBuffer(length * byte_width_, pool)); + + uint8_t* raw_output = key_buf->mutable_data(); + for (int32_t i = 0; i < length; ++i) { + auto& encoded_ptr = encoded_bytes[i]; + std::memcpy(raw_output, encoded_ptr, byte_width_); + encoded_ptr += byte_width_; + raw_output += byte_width_; + } + + return ArrayData::Make(type_, length, {std::move(null_buf), std::move(key_buf)}, + null_count); + } + + std::shared_ptr type_; + int byte_width_; +}; + +struct DictionaryKeyEncoder : FixedWidthKeyEncoder { + DictionaryKeyEncoder(std::shared_ptr type, MemoryPool* pool) + : FixedWidthKeyEncoder(std::move(type)), pool_(pool) {} + + Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override { + auto dict = MakeArray(data.dictionary); + if (dictionary_) { + if (!dictionary_->Equals(dict)) { + // TODO(bkietz) unify if necessary. For now, just error if any batch's dictionary + // differs from the first we saw for this key + return Status::NotImplemented("Unifying differing dictionaries"); + } + } else { + dictionary_ = std::move(dict); + } + return FixedWidthKeyEncoder::Encode(data, encoded_bytes); + } + + Result> Decode(uint8_t** encoded_bytes, int32_t length, + MemoryPool* pool) override { + ARROW_ASSIGN_OR_RAISE(auto data, + FixedWidthKeyEncoder::Decode(encoded_bytes, length, pool)); + + if (dictionary_) { + data->dictionary = dictionary_->data(); + } else { + ARROW_ASSIGN_OR_RAISE(auto dict, MakeArrayOfNull(type_, 0)); + data->dictionary = dict->data(); + } + + data->type = type_; + return data; + } + + MemoryPool* pool_; + std::shared_ptr dictionary_; +}; + +template +struct VarLengthKeyEncoder : KeyEncoder { + using Offset = typename T::offset_type; + + void AddLength(const ArrayData& data, int32_t* lengths) override { + int64_t i = 0; + VisitArrayDataInline( + data, + [&](util::string_view bytes) { + lengths[i++] += + kExtraByteForNull + sizeof(Offset) + static_cast(bytes.size()); + }, + [&] { lengths[i++] += kExtraByteForNull + sizeof(Offset); }); + } + + Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override { + VisitArrayDataInline( + data, + [&](util::string_view bytes) { + auto& encoded_ptr = *encoded_bytes++; + *encoded_ptr++ = kValidByte; + util::SafeStore(encoded_ptr, static_cast(bytes.size())); + encoded_ptr += sizeof(Offset); + memcpy(encoded_ptr, bytes.data(), bytes.size()); + encoded_ptr += bytes.size(); + }, + [&] { + auto& encoded_ptr = *encoded_bytes++; + *encoded_ptr++ = kNullByte; + util::SafeStore(encoded_ptr, static_cast(0)); + encoded_ptr += sizeof(Offset); + }); + return Status::OK(); + } + + Result> Decode(uint8_t** encoded_bytes, int32_t length, + MemoryPool* pool) override { + std::shared_ptr null_buf; + 
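+ // Decoding is two-pass: after the nullity bytes are stripped, a first pass
+ // sums the stored lengths to size one contiguous value buffer, then a second
+ // pass replays the encoded keys to write the offsets and copy the bytes.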
int32_t null_count; + RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count)); + + Offset length_sum = 0; + for (int32_t i = 0; i < length; ++i) { + length_sum += util::SafeLoadAs(encoded_bytes[i]); + } + + ARROW_ASSIGN_OR_RAISE(auto offset_buf, + AllocateBuffer(sizeof(Offset) * (1 + length), pool)); + ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBuffer(length_sum)); + + auto raw_offsets = reinterpret_cast(offset_buf->mutable_data()); + auto raw_keys = key_buf->mutable_data(); + + Offset current_offset = 0; + for (int32_t i = 0; i < length; ++i) { + raw_offsets[i] = current_offset; + + auto key_length = util::SafeLoadAs(encoded_bytes[i]); + encoded_bytes[i] += sizeof(Offset); + + memcpy(raw_keys + current_offset, encoded_bytes[i], key_length); + encoded_bytes[i] += key_length; + + current_offset += key_length; + } + raw_offsets[length] = current_offset; + + return ArrayData::Make( + type_, length, {std::move(null_buf), std::move(offset_buf), std::move(key_buf)}, + null_count); + } + + explicit VarLengthKeyEncoder(std::shared_ptr type) : type_(std::move(type)) {} + + std::shared_ptr type_; +}; + +struct GrouperImpl : Grouper { + static Result> Make(const std::vector& keys, + ExecContext* ctx) { + auto impl = ::arrow::internal::make_unique(); + + impl->encoders_.resize(keys.size()); + impl->ctx_ = ctx; + + for (size_t i = 0; i < keys.size(); ++i) { + const auto& key = keys[i].type; + + if (key->id() == Type::BOOL) { + impl->encoders_[i] = ::arrow::internal::make_unique(); + continue; + } + + if (key->id() == Type::DICTIONARY) { + impl->encoders_[i] = + ::arrow::internal::make_unique(key, ctx->memory_pool()); + continue; + } + + if (is_fixed_width(key->id())) { + impl->encoders_[i] = ::arrow::internal::make_unique(key); + continue; + } + + if (is_binary_like(key->id())) { + impl->encoders_[i] = + ::arrow::internal::make_unique>(key); + continue; + } + + if (is_large_binary_like(key->id())) { + impl->encoders_[i] = + ::arrow::internal::make_unique>(key); + continue; + } + + return Status::NotImplemented("Keys of type ", *key); + } + + return std::move(impl); + } + + Result Consume(const ExecBatch& batch) override { + std::vector offsets_batch(batch.length + 1); + for (int i = 0; i < batch.num_values(); ++i) { + encoders_[i]->AddLength(*batch[i].array(), offsets_batch.data()); + } + + int32_t total_length = 0; + for (int64_t i = 0; i < batch.length; ++i) { + auto total_length_before = total_length; + total_length += offsets_batch[i]; + offsets_batch[i] = total_length_before; + } + offsets_batch[batch.length] = total_length; + + std::vector key_bytes_batch(total_length); + std::vector key_buf_ptrs(batch.length); + for (int64_t i = 0; i < batch.length; ++i) { + key_buf_ptrs[i] = key_bytes_batch.data() + offsets_batch[i]; + } + + for (int i = 0; i < batch.num_values(); ++i) { + RETURN_NOT_OK(encoders_[i]->Encode(*batch[i].array(), key_buf_ptrs.data())); + } + + TypedBufferBuilder group_ids_batch(ctx_->memory_pool()); + RETURN_NOT_OK(group_ids_batch.Resize(batch.length)); + + for (int64_t i = 0; i < batch.length; ++i) { + int32_t key_length = offsets_batch[i + 1] - offsets_batch[i]; + std::string key( + reinterpret_cast(key_bytes_batch.data() + offsets_batch[i]), + key_length); + + auto it_success = map_.emplace(key, num_groups_); + auto group_id = it_success.first->second; + + if (it_success.second) { + // new key; update offsets and key_bytes + ++num_groups_; + auto next_key_offset = static_cast(key_bytes_.size()); + key_bytes_.resize(next_key_offset + key_length); + 
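+ // offsets_ records where each group's encoded key ends within key_bytes_
+ // (offsets_[0] is always 0), so GetUniques can later re-slice and Decode
+ // every unique key.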
offsets_.push_back(next_key_offset + key_length); + memcpy(key_bytes_.data() + next_key_offset, key.c_str(), key_length); + } + + group_ids_batch.UnsafeAppend(group_id); + } + + ARROW_ASSIGN_OR_RAISE(auto group_ids, group_ids_batch.Finish()); + return Datum(UInt32Array(batch.length, std::move(group_ids))); + } + + uint32_t num_groups() const override { return num_groups_; } + + Result GetUniques() override { + ExecBatch out({}, num_groups_); + + std::vector key_buf_ptrs(num_groups_); + for (int64_t i = 0; i < num_groups_; ++i) { + key_buf_ptrs[i] = key_bytes_.data() + offsets_[i]; + } + + out.values.resize(encoders_.size()); + for (size_t i = 0; i < encoders_.size(); ++i) { + ARROW_ASSIGN_OR_RAISE( + out.values[i], + encoders_[i]->Decode(key_buf_ptrs.data(), static_cast(num_groups_), + ctx_->memory_pool())); + } + + return out; + } + + ExecContext* ctx_; + std::unordered_map map_; + std::vector offsets_ = {0}; + std::vector key_bytes_; + uint32_t num_groups_ = 0; + std::vector> encoders_; +}; + +/// C++ abstract base class for the HashAggregateKernel interface. +/// Implementations should be default constructible and perform initialization in +/// Init(). +struct GroupedAggregator : KernelState { + virtual Status Init(ExecContext*, const FunctionOptions*, + const std::shared_ptr&) = 0; + + virtual Status Consume(const ExecBatch& batch) = 0; + + virtual Result Finalize() = 0; + + template + Status MaybeReserve(int64_t old_num_groups, const ExecBatch& batch, + const Reserve& reserve) { + int64_t new_num_groups = batch[2].scalar_as().value; + if (new_num_groups <= old_num_groups) { + return Status::OK(); + } + return reserve(new_num_groups - old_num_groups); + } + + virtual std::shared_ptr out_type() const = 0; +}; + +// ---------------------------------------------------------------------- +// Count implementation + +struct GroupedCountImpl : public GroupedAggregator { + Status Init(ExecContext* ctx, const FunctionOptions* options, + const std::shared_ptr&) override { + options_ = checked_cast(*options); + counts_ = BufferBuilder(ctx->memory_pool()); + return Status::OK(); + } + + Status Consume(const ExecBatch& batch) override { + RETURN_NOT_OK(MaybeReserve(counts_.length(), batch, [&](int64_t added_groups) { + num_groups_ += added_groups; + return counts_.Append(added_groups * sizeof(int64_t), 0); + })); + + auto group_ids = batch[1].array()->GetValues(1); + auto raw_counts = reinterpret_cast(counts_.mutable_data()); + + const auto& input = batch[0].array(); + + if (options_.count_mode == CountOptions::COUNT_NULL) { + for (int64_t i = 0, input_i = input->offset; i < input->length; ++i, ++input_i) { + auto g = group_ids[i]; + raw_counts[g] += !BitUtil::GetBit(input->buffers[0]->data(), input_i); + } + return Status::OK(); + } + + arrow::internal::VisitSetBitRunsVoid( + input->buffers[0], input->offset, input->length, + [&](int64_t begin, int64_t length) { + for (int64_t input_i = begin, i = begin - input->offset; + input_i < begin + length; ++input_i, ++i) { + auto g = group_ids[i]; + raw_counts[g] += 1; + } + }); + return Status::OK(); + } + + Result Finalize() override { + ARROW_ASSIGN_OR_RAISE(auto counts, counts_.Finish()); + return std::make_shared(num_groups_, std::move(counts)); + } + + std::shared_ptr out_type() const override { return int64(); } + + int64_t num_groups_ = 0; + CountOptions options_; + BufferBuilder counts_; +}; + +// ---------------------------------------------------------------------- +// Sum implementation + +struct GroupedSumImpl : public GroupedAggregator { + 
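+ // Editorial note: sums accumulate in a widened type chosen per input type
+ // (e.g. int32 inputs are summed into int64 slots), one slot per group id.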
// NB: whether we are accumulating into double, int64_t, or uint64_t + // we always have 64 bits per group in the sums buffer. + static constexpr size_t kSumSize = sizeof(int64_t); + + using ConsumeImpl = std::function&, + const uint32_t*, void*, int64_t*)>; + + struct GetConsumeImpl { + template ::Type> + Status Visit(const T&) { + consume_impl = [](const std::shared_ptr& input, const uint32_t* group, + void* boxed_sums, int64_t* counts) { + auto sums = reinterpret_cast::CType*>(boxed_sums); + + VisitArrayDataInline( + *input, + [&](typename TypeTraits::CType value) { + sums[*group] += value; + counts[*group] += 1; + ++group; + }, + [&] { ++group; }); + }; + out_type = TypeTraits::type_singleton(); + return Status::OK(); + } + + Status Visit(const HalfFloatType& type) { + return Status::NotImplemented("Summing data of type ", type); + } + + Status Visit(const DataType& type) { + return Status::NotImplemented("Summing data of type ", type); + } + + ConsumeImpl consume_impl; + std::shared_ptr out_type; + }; + + Status Init(ExecContext* ctx, const FunctionOptions*, + const std::shared_ptr& input_type) override { + pool_ = ctx->memory_pool(); + sums_ = BufferBuilder(pool_); + counts_ = BufferBuilder(pool_); + + GetConsumeImpl get_consume_impl; + RETURN_NOT_OK(VisitTypeInline(*input_type, &get_consume_impl)); + + consume_impl_ = std::move(get_consume_impl.consume_impl); + out_type_ = std::move(get_consume_impl.out_type); + + return Status::OK(); + } + + Status Consume(const ExecBatch& batch) override { + RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) { + num_groups_ += added_groups; + RETURN_NOT_OK(sums_.Append(added_groups * kSumSize, 0)); + RETURN_NOT_OK(counts_.Append(added_groups * sizeof(int64_t), 0)); + return Status::OK(); + })); + + auto group_ids = batch[1].array()->GetValues(1); + consume_impl_(batch[0].array(), group_ids, sums_.mutable_data(), + reinterpret_cast(counts_.mutable_data())); + return Status::OK(); + } + + Result Finalize() override { + std::shared_ptr null_bitmap; + int64_t null_count = 0; + + for (int64_t i = 0; i < num_groups_; ++i) { + if (reinterpret_cast(counts_.data())[i] > 0) continue; + + if (null_bitmap == nullptr) { + ARROW_ASSIGN_OR_RAISE(null_bitmap, AllocateBitmap(num_groups_, pool_)); + BitUtil::SetBitsTo(null_bitmap->mutable_data(), 0, num_groups_, true); + } + + null_count += 1; + BitUtil::SetBitTo(null_bitmap->mutable_data(), i, false); + } + + ARROW_ASSIGN_OR_RAISE(auto sums, sums_.Finish()); + + return ArrayData::Make(std::move(out_type_), num_groups_, + {std::move(null_bitmap), std::move(sums)}, null_count); + } + + std::shared_ptr out_type() const override { return out_type_; } + + // NB: counts are used here instead of a simple "has_values_" bitmap since + // we expect to reuse this kernel to handle Mean + int64_t num_groups_ = 0; + BufferBuilder sums_, counts_; + std::shared_ptr out_type_; + ConsumeImpl consume_impl_; + MemoryPool* pool_; +}; + +// ---------------------------------------------------------------------- +// MinMax implementation + +template +struct Extrema : std::numeric_limits {}; + +template <> +struct Extrema { + static constexpr float min() { return -std::numeric_limits::infinity(); } + static constexpr float max() { return std::numeric_limits::infinity(); } +}; + +template <> +struct Extrema { + static constexpr double min() { return -std::numeric_limits::infinity(); } + static constexpr double max() { return std::numeric_limits::infinity(); } +}; + +struct GroupedMinMaxImpl : public GroupedAggregator 
{ + using ConsumeImpl = + std::function&, const uint32_t*, void*, void*, + uint8_t*, uint8_t*)>; + + using ResizeImpl = std::function; + + template + static ResizeImpl MakeResizeImpl(CType anti_extreme) { + // resize a min or max buffer, storing the correct anti extreme + return [anti_extreme](BufferBuilder* builder, int64_t added_groups) { + TypedBufferBuilder typed_builder(std::move(*builder)); + RETURN_NOT_OK(typed_builder.Append(added_groups, anti_extreme)); + *builder = std::move(*typed_builder.bytes_builder()); + return Status::OK(); + }; + } + + struct GetImpl { + template ::CType> + enable_if_number Visit(const T&) { + consume_impl = [](const std::shared_ptr& input, const uint32_t* group, + void* mins, void* maxes, uint8_t* has_values, + uint8_t* has_nulls) { + auto raw_mins = reinterpret_cast(mins); + auto raw_maxes = reinterpret_cast(maxes); + + VisitArrayDataInline( + *input, + [&](CType val) { + raw_maxes[*group] = std::max(raw_maxes[*group], val); + raw_mins[*group] = std::min(raw_mins[*group], val); + BitUtil::SetBit(has_values, *group++); + }, + [&] { BitUtil::SetBit(has_nulls, *group++); }); + }; + + resize_min_impl = MakeResizeImpl(Extrema::max()); + resize_max_impl = MakeResizeImpl(Extrema::min()); + return Status::OK(); + } + + Status Visit(const BooleanType& type) { + return Status::NotImplemented("Grouped MinMax data of type ", type); + } + + Status Visit(const HalfFloatType& type) { + return Status::NotImplemented("Grouped MinMax data of type ", type); + } + + Status Visit(const DataType& type) { + return Status::NotImplemented("Grouped MinMax data of type ", type); + } + + ConsumeImpl consume_impl; + ResizeImpl resize_min_impl, resize_max_impl; + }; + + Status Init(ExecContext* ctx, const FunctionOptions* options, + const std::shared_ptr& input_type) override { + options_ = *checked_cast(options); + type_ = input_type; + + mins_ = BufferBuilder(ctx->memory_pool()); + maxes_ = BufferBuilder(ctx->memory_pool()); + has_values_ = BufferBuilder(ctx->memory_pool()); + has_nulls_ = BufferBuilder(ctx->memory_pool()); + + GetImpl get_impl; + RETURN_NOT_OK(VisitTypeInline(*input_type, &get_impl)); + + consume_impl_ = std::move(get_impl.consume_impl); + resize_min_impl_ = std::move(get_impl.resize_min_impl); + resize_max_impl_ = std::move(get_impl.resize_max_impl); + resize_bitmap_impl_ = MakeResizeImpl(false); + + return Status::OK(); + } + + Status Consume(const ExecBatch& batch) override { + RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) { + num_groups_ += added_groups; + RETURN_NOT_OK(resize_min_impl_(&mins_, added_groups)); + RETURN_NOT_OK(resize_max_impl_(&maxes_, added_groups)); + RETURN_NOT_OK(resize_bitmap_impl_(&has_values_, added_groups)); + RETURN_NOT_OK(resize_bitmap_impl_(&has_nulls_, added_groups)); + return Status::OK(); + })); + + auto group_ids = batch[1].array()->GetValues(1); + consume_impl_(batch[0].array(), group_ids, mins_.mutable_data(), + maxes_.mutable_data(), has_values_.mutable_data(), + has_nulls_.mutable_data()); + return Status::OK(); + } + + Result Finalize() override { + // aggregation for group is valid if there was at least one value in that group + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, has_values_.Finish()); + + if (options_.null_handling == MinMaxOptions::EMIT_NULL) { + // ... 
and there were no nulls in that group + ARROW_ASSIGN_OR_RAISE(auto has_nulls, has_nulls_.Finish()); + arrow::internal::BitmapAndNot(null_bitmap->data(), 0, has_nulls->data(), 0, + num_groups_, 0, null_bitmap->mutable_data()); + } + + auto mins = ArrayData::Make(type_, num_groups_, {null_bitmap, nullptr}); + auto maxes = ArrayData::Make(type_, num_groups_, {std::move(null_bitmap), nullptr}); + ARROW_ASSIGN_OR_RAISE(mins->buffers[1], mins_.Finish()); + ARROW_ASSIGN_OR_RAISE(maxes->buffers[1], maxes_.Finish()); + + return ArrayData::Make(out_type(), num_groups_, {nullptr}, + {std::move(mins), std::move(maxes)}); + } + + std::shared_ptr out_type() const override { + return struct_({field("min", type_), field("max", type_)}); + } + + int64_t num_groups_; + BufferBuilder mins_, maxes_, has_values_, has_nulls_; + std::shared_ptr type_; + ConsumeImpl consume_impl_; + ResizeImpl resize_min_impl_, resize_max_impl_, resize_bitmap_impl_; + MinMaxOptions options_; +}; + +template +HashAggregateKernel MakeKernel(InputType argument_type) { + HashAggregateKernel kernel; + + kernel.init = [](KernelContext* ctx, + const KernelInitArgs& args) -> std::unique_ptr { + auto impl = ::arrow::internal::make_unique(); + // FIXME(bkietz) Init should not take a type. That should be an unboxed template arg + // for the Impl. Otherwise we're not exposing dispatch as well as we should. + ctx->SetStatus(impl->Init(ctx->exec_context(), args.options, args.inputs[0].type)); + if (ctx->HasError()) return nullptr; + return std::move(impl); + }; + + kernel.signature = KernelSignature::Make( + {std::move(argument_type), InputType::Array(Type::UINT32), + InputType::Scalar(Type::UINT32)}, + OutputType( + [](KernelContext* ctx, const std::vector&) -> Result { + return checked_cast(ctx->state())->out_type(); + })); + + kernel.consume = [](KernelContext* ctx, const ExecBatch& batch) { + ctx->SetStatus(checked_cast(ctx->state())->Consume(batch)); + }; + + kernel.merge = [](KernelContext* ctx, KernelState&&, KernelState*) { + // TODO(ARROW-11840) merge two hash tables + ctx->SetStatus(Status::NotImplemented("Merge hashed aggregations")); + }; + + kernel.finalize = [](KernelContext* ctx, Datum* out) { + KERNEL_ASSIGN_OR_RAISE(*out, ctx, + checked_cast(ctx->state())->Finalize()); + }; + + return kernel; +} + +Result> GetKernels( + ExecContext* ctx, const std::vector& aggregates, + const std::vector& in_descrs) { + if (aggregates.size() != in_descrs.size()) { + return Status::Invalid(aggregates.size(), " aggregate functions were specified but ", + in_descrs.size(), " arguments were provided."); + } + + std::vector kernels(in_descrs.size()); + + for (size_t i = 0; i < aggregates.size(); ++i) { + ARROW_ASSIGN_OR_RAISE(auto function, + ctx->func_registry()->GetFunction(aggregates[i].function)); + ARROW_ASSIGN_OR_RAISE( + const Kernel* kernel, + function->DispatchExact( + {in_descrs[i], ValueDescr::Array(uint32()), ValueDescr::Scalar(uint32())})); + kernels[i] = static_cast(kernel); + } + return kernels; +} + +Result>> InitKernels( + const std::vector& kernels, ExecContext* ctx, + const std::vector& aggregates, const std::vector& in_descrs) { + std::vector> states(kernels.size()); + + for (size_t i = 0; i < aggregates.size(); ++i) { + auto options = aggregates[i].options; + + if (options == nullptr) { + // use known default options for the named function if possible + auto maybe_function = ctx->func_registry()->GetFunction(aggregates[i].function); + if (maybe_function.ok()) { + options = maybe_function.ValueOrDie()->default_options(); + } + } 
+ + KernelContext kernel_ctx{ctx}; + states[i] = kernels[i]->init(&kernel_ctx, KernelInitArgs{kernels[i], + { + in_descrs[i].type, + uint32(), + uint32(), + }, + options}); + if (kernel_ctx.HasError()) return kernel_ctx.status(); + } + + return std::move(states); +} + +Result ResolveKernels( + const std::vector& aggregates, + const std::vector& kernels, + const std::vector>& states, ExecContext* ctx, + const std::vector& descrs) { + FieldVector fields(descrs.size()); + + for (size_t i = 0; i < kernels.size(); ++i) { + KernelContext kernel_ctx{ctx}; + kernel_ctx.SetState(states[i].get()); + + ARROW_ASSIGN_OR_RAISE(auto descr, kernels[i]->signature->out_type().Resolve( + &kernel_ctx, { + descrs[i].type, + uint32(), + uint32(), + })); + fields[i] = field(aggregates[i].function, std::move(descr.type)); + } + return fields; +} + +} // namespace + +Result> Grouper::Make(const std::vector& descrs, + ExecContext* ctx) { + return GrouperImpl::Make(descrs, ctx); +} + +Result GroupBy(const std::vector& arguments, const std::vector& keys, + const std::vector& aggregates, ExecContext* ctx) { + // Construct and initialize HashAggregateKernels + ARROW_ASSIGN_OR_RAISE(auto argument_descrs, + ExecBatch::Make(arguments).Map( + [](ExecBatch batch) { return batch.GetDescriptors(); })); + + ARROW_ASSIGN_OR_RAISE(auto kernels, GetKernels(ctx, aggregates, argument_descrs)); + + ARROW_ASSIGN_OR_RAISE(auto states, + InitKernels(kernels, ctx, aggregates, argument_descrs)); + + ARROW_ASSIGN_OR_RAISE( + FieldVector out_fields, + ResolveKernels(aggregates, kernels, states, ctx, argument_descrs)); + + using arrow::compute::detail::ExecBatchIterator; + + ARROW_ASSIGN_OR_RAISE(auto argument_batch_iterator, + ExecBatchIterator::Make(arguments, ctx->exec_chunksize())); + + // Construct Grouper + ARROW_ASSIGN_OR_RAISE(auto key_descrs, ExecBatch::Make(keys).Map([](ExecBatch batch) { + return batch.GetDescriptors(); + })); + + ARROW_ASSIGN_OR_RAISE(auto grouper, Grouper::Make(key_descrs, ctx)); + + int i = 0; + for (ValueDescr& key_descr : key_descrs) { + out_fields.push_back(field("key_" + std::to_string(i++), std::move(key_descr.type))); + } + + ARROW_ASSIGN_OR_RAISE(auto key_batch_iterator, + ExecBatchIterator::Make(keys, ctx->exec_chunksize())); + + // start "streaming" execution + ExecBatch key_batch, argument_batch; + while (argument_batch_iterator->Next(&argument_batch) && + key_batch_iterator->Next(&key_batch)) { + if (key_batch.length == 0) continue; + + // compute a batch of group ids + ARROW_ASSIGN_OR_RAISE(Datum id_batch, grouper->Consume(key_batch)); + + // consume group ids with HashAggregateKernels + for (size_t i = 0; i < kernels.size(); ++i) { + KernelContext batch_ctx{ctx}; + batch_ctx.SetState(states[i].get()); + ARROW_ASSIGN_OR_RAISE(auto batch, ExecBatch::Make({argument_batch[i], id_batch, + Datum(grouper->num_groups())})); + kernels[i]->consume(&batch_ctx, batch); + if (batch_ctx.HasError()) return batch_ctx.status(); + } + } + + // Finalize output + ArrayDataVector out_data(arguments.size() + keys.size()); + auto it = out_data.begin(); + + for (size_t i = 0; i < kernels.size(); ++i) { + KernelContext batch_ctx{ctx}; + batch_ctx.SetState(states[i].get()); + Datum out; + kernels[i]->finalize(&batch_ctx, &out); + if (batch_ctx.HasError()) return batch_ctx.status(); + *it++ = out.array(); + } + + ARROW_ASSIGN_OR_RAISE(ExecBatch out_keys, grouper->GetUniques()); + for (const auto& key : out_keys.values) { + *it++ = key.array(); + } + + int64_t length = out_data[0]->length; + return 
ArrayData::Make(struct_(std::move(out_fields)), length, + {/*null_bitmap=*/nullptr}, std::move(out_data), + /*null_count=*/0); +} + +Result> Grouper::ApplyGroupings(const ListArray& groupings, + const Array& array, + ExecContext* ctx) { + ARROW_ASSIGN_OR_RAISE(Datum sorted, + compute::Take(array, groupings.data()->child_data[0], + TakeOptions::NoBoundsCheck(), ctx)); + + return std::make_shared(list(array.type()), groupings.length(), + groupings.value_offsets(), sorted.make_array()); +} + +Result> Grouper::MakeGroupings(const UInt32Array& ids, + uint32_t num_groups, + ExecContext* ctx) { + if (ids.null_count() != 0) { + return Status::Invalid("MakeGroupings with null ids"); + } + + ARROW_ASSIGN_OR_RAISE(auto offsets, AllocateBuffer(sizeof(int32_t) * (num_groups + 1), + ctx->memory_pool())); + auto raw_offsets = reinterpret_cast(offsets->mutable_data()); + + std::memset(raw_offsets, 0, offsets->size()); + for (int i = 0; i < ids.length(); ++i) { + DCHECK_LT(ids.Value(i), num_groups); + raw_offsets[ids.Value(i)] += 1; + } + int32_t length = 0; + for (uint32_t id = 0; id < num_groups; ++id) { + auto offset = raw_offsets[id]; + raw_offsets[id] = length; + length += offset; + } + raw_offsets[num_groups] = length; + DCHECK_EQ(ids.length(), length); + + ARROW_ASSIGN_OR_RAISE(auto offsets_copy, + offsets->CopySlice(0, offsets->size(), ctx->memory_pool())); + raw_offsets = reinterpret_cast(offsets_copy->mutable_data()); + + ARROW_ASSIGN_OR_RAISE(auto sort_indices, AllocateBuffer(sizeof(int32_t) * ids.length(), + ctx->memory_pool())); + auto raw_sort_indices = reinterpret_cast(sort_indices->mutable_data()); + for (int i = 0; i < ids.length(); ++i) { + raw_sort_indices[raw_offsets[ids.Value(i)]++] = i; + } + + return std::make_shared( + list(int32()), num_groups, std::move(offsets), + std::make_shared(ids.length(), std::move(sort_indices))); +} + +namespace { +const FunctionDoc hash_count_doc{"Count the number of null / non-null values", + ("By default, non-null values are counted.\n" + "This can be changed through CountOptions."), + {"array", "group_id_array", "group_count"}, + "CountOptions"}; + +const FunctionDoc hash_sum_doc{"Sum values of a numeric array", + ("Null values are ignored."), + {"array", "group_id_array", "group_count"}}; + +const FunctionDoc hash_min_max_doc{ + "Compute the minimum and maximum values of a numeric array", + ("Null values are ignored by default.\n" + "This can be changed through MinMaxOptions."), + {"array", "group_id_array", "group_count"}, + "MinMaxOptions"}; +} // namespace + +void RegisterHashAggregateBasic(FunctionRegistry* registry) { + { + static auto default_count_options = CountOptions::Defaults(); + auto func = std::make_shared( + "hash_count", Arity::Ternary(), &hash_count_doc, &default_count_options); + DCHECK_OK(func->AddKernel(MakeKernel(ValueDescr::ARRAY))); + DCHECK_OK(registry->AddFunction(std::move(func))); + } + + { + auto func = std::make_shared("hash_sum", Arity::Ternary(), + &hash_sum_doc); + DCHECK_OK(func->AddKernel(MakeKernel(ValueDescr::ARRAY))); + DCHECK_OK(registry->AddFunction(std::move(func))); + } + + { + static auto default_minmax_options = MinMaxOptions::Defaults(); + auto func = std::make_shared( + "hash_min_max", Arity::Ternary(), &hash_min_max_doc, &default_minmax_options); + DCHECK_OK(func->AddKernel(MakeKernel(ValueDescr::ARRAY))); + DCHECK_OK(registry->AddFunction(std::move(func))); + } +} + +} // namespace internal +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc 
b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc new file mode 100644 index 00000000000..7858d8bb147 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc @@ -0,0 +1,703 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/array.h" +#include "arrow/chunked_array.h" +#include "arrow/compute/api_aggregate.h" +#include "arrow/compute/api_scalar.h" +#include "arrow/compute/api_vector.h" +#include "arrow/compute/cast.h" +#include "arrow/compute/kernels/aggregate_internal.h" +#include "arrow/compute/kernels/codegen_internal.h" +#include "arrow/compute/kernels/test_util.h" +#include "arrow/compute/registry.h" +#include "arrow/testing/generator.h" +#include "arrow/testing/gtest_common.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bitmap_reader.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/int_util_internal.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/logging.h" + +using testing::HasSubstr; + +namespace arrow { + +using internal::BitmapReader; +using internal::checked_cast; +using internal::checked_pointer_cast; + +namespace compute { +namespace { + +Result NaiveGroupBy(std::vector arguments, std::vector keys, + const std::vector& aggregates) { + ARROW_ASSIGN_OR_RAISE(auto key_batch, ExecBatch::Make(std::move(keys))); + + ARROW_ASSIGN_OR_RAISE(auto grouper, + internal::Grouper::Make(key_batch.GetDescriptors())); + + ARROW_ASSIGN_OR_RAISE(Datum id_batch, grouper->Consume(key_batch)); + + ARROW_ASSIGN_OR_RAISE( + auto groupings, internal::Grouper::MakeGroupings(*id_batch.array_as(), + grouper->num_groups())); + + ArrayVector out_columns; + std::vector out_names; + + for (size_t i = 0; i < arguments.size(); ++i) { + out_names.push_back(aggregates[i].function); + + // trim "hash_" prefix + auto scalar_agg_function = aggregates[i].function.substr(5); + + ARROW_ASSIGN_OR_RAISE( + auto grouped_argument, + internal::Grouper::ApplyGroupings(*groupings, *arguments[i].make_array())); + + ScalarVector aggregated_scalars; + + for (int64_t i_group = 0; i_group < grouper->num_groups(); ++i_group) { + auto slice = grouped_argument->value_slice(i_group); + if (slice->length() == 0) continue; + ARROW_ASSIGN_OR_RAISE( + Datum d, CallFunction(scalar_agg_function, {slice}, aggregates[i].options)); + aggregated_scalars.push_back(d.scalar()); + } + + ARROW_ASSIGN_OR_RAISE(Datum aggregated_column, + ScalarVectorToArray(aggregated_scalars)); + out_columns.push_back(aggregated_column.make_array()); + } + + int i = 0; + ARROW_ASSIGN_OR_RAISE(auto uniques, grouper->GetUniques()); + for (const Datum& key : 
uniques.values) { + out_columns.push_back(key.make_array()); + out_names.push_back("key_" + std::to_string(i++)); + } + + return StructArray::Make(std::move(out_columns), std::move(out_names)); +} + +void ValidateGroupBy(const std::vector& aggregates, + std::vector arguments, std::vector keys) { + ASSERT_OK_AND_ASSIGN(Datum expected, NaiveGroupBy(arguments, keys, aggregates)); + + ASSERT_OK_AND_ASSIGN(Datum actual, GroupBy(arguments, keys, aggregates)); + + ASSERT_OK(expected.make_array()->ValidateFull()); + ASSERT_OK(actual.make_array()->ValidateFull()); + + AssertDatumsEqual(expected, actual, /*verbose=*/true); +} + +} // namespace + +TEST(Grouper, SupportedKeys) { + ASSERT_OK(internal::Grouper::Make({boolean()})); + + ASSERT_OK(internal::Grouper::Make({int8(), uint16(), int32(), uint64()})); + + ASSERT_OK(internal::Grouper::Make({dictionary(int64(), utf8())})); + + ASSERT_OK(internal::Grouper::Make({float16(), float32(), float64()})); + + ASSERT_OK(internal::Grouper::Make({utf8(), binary(), large_utf8(), large_binary()})); + + ASSERT_OK(internal::Grouper::Make({fixed_size_binary(16), fixed_size_binary(32)})); + + ASSERT_OK(internal::Grouper::Make({decimal128(32, 10), decimal256(76, 20)})); + + ASSERT_OK(internal::Grouper::Make({date32(), date64()})); + + for (auto unit : { + TimeUnit::SECOND, + TimeUnit::MILLI, + TimeUnit::MICRO, + TimeUnit::NANO, + }) { + ASSERT_OK(internal::Grouper::Make({timestamp(unit), duration(unit)})); + } + + ASSERT_OK(internal::Grouper::Make({day_time_interval(), month_interval()})); + + ASSERT_RAISES(NotImplemented, internal::Grouper::Make({struct_({field("", int64())})})); + + ASSERT_RAISES(NotImplemented, internal::Grouper::Make({struct_({})})); + + ASSERT_RAISES(NotImplemented, internal::Grouper::Make({list(int32())})); + + ASSERT_RAISES(NotImplemented, internal::Grouper::Make({fixed_size_list(int32(), 5)})); + + ASSERT_RAISES(NotImplemented, + internal::Grouper::Make({dense_union({field("", int32())})})); +} + +struct TestGrouper { + explicit TestGrouper(std::vector descrs) : descrs_(std::move(descrs)) { + grouper_ = internal::Grouper::Make(descrs_).ValueOrDie(); + + FieldVector fields; + for (const auto& descr : descrs_) { + fields.push_back(field("", descr.type)); + } + key_schema_ = schema(std::move(fields)); + } + + void ExpectConsume(const std::string& key_json, const std::string& expected) { + ExpectConsume(ExecBatch(*RecordBatchFromJSON(key_schema_, key_json)), + ArrayFromJSON(uint32(), expected)); + } + + void ExpectConsume(const std::vector& key_batch, Datum expected) { + ExpectConsume(*ExecBatch::Make(key_batch), expected); + } + + void ExpectConsume(const ExecBatch& key_batch, Datum expected) { + Datum ids; + ConsumeAndValidate(key_batch, &ids); + AssertDatumsEqual(expected, ids, /*verbose=*/true); + } + + void ConsumeAndValidate(const ExecBatch& key_batch, Datum* ids = nullptr) { + ASSERT_OK_AND_ASSIGN(Datum id_batch, grouper_->Consume(key_batch)); + + ValidateConsume(key_batch, id_batch); + + if (ids) { + *ids = std::move(id_batch); + } + } + + void ValidateConsume(const ExecBatch& key_batch, const Datum& id_batch) { + if (uniques_.length == -1) { + ASSERT_OK_AND_ASSIGN(uniques_, grouper_->GetUniques()); + } else if (static_cast(grouper_->num_groups()) > uniques_.length) { + ASSERT_OK_AND_ASSIGN(ExecBatch new_uniques, grouper_->GetUniques()); + + // check that uniques_ are prefixes of new_uniques + for (int i = 0; i < uniques_.num_values(); ++i) { + auto new_unique = new_uniques[i].make_array(); + ASSERT_OK(new_unique->ValidateFull()); + + 
AssertDatumsEqual(uniques_[i], new_unique->Slice(0, uniques_.length), + /*verbose=*/true); + } + + uniques_ = std::move(new_uniques); + } + + // check that the ids encode an equivalent key sequence + auto ids = id_batch.make_array(); + ASSERT_OK(ids->ValidateFull()); + + for (int i = 0; i < key_batch.num_values(); ++i) { + SCOPED_TRACE(std::to_string(i) + "th key array"); + auto original = key_batch[i].make_array(); + ASSERT_OK_AND_ASSIGN(auto encoded, Take(*uniques_[i].make_array(), *ids)); + AssertArraysEqual(*original, *encoded, /*verbose=*/true, + EqualOptions().nans_equal(true)); + } + } + + std::vector descrs_; + std::shared_ptr key_schema_; + std::unique_ptr grouper_; + ExecBatch uniques_ = ExecBatch({}, -1); +}; + +TEST(Grouper, BooleanKey) { + TestGrouper g({boolean()}); + + g.ExpectConsume("[[true], [true]]", "[0, 0]"); + + g.ExpectConsume("[[true], [true]]", "[0, 0]"); + + g.ExpectConsume("[[false], [null]]", "[1, 2]"); + + g.ExpectConsume("[[true], [false], [true], [false], [null], [false], [null]]", + "[0, 1, 0, 1, 2, 1, 2]"); +} + +TEST(Grouper, NumericKey) { + for (auto ty : { + uint8(), + int8(), + uint16(), + int16(), + uint32(), + int32(), + uint64(), + int64(), + float16(), + float32(), + float64(), + }) { + SCOPED_TRACE("key type: " + ty->ToString()); + + TestGrouper g({ty}); + + g.ExpectConsume("[[3], [3]]", "[0, 0]"); + + g.ExpectConsume("[[3], [3]]", "[0, 0]"); + + g.ExpectConsume("[[27], [81]]", "[1, 2]"); + + g.ExpectConsume("[[3], [27], [3], [27], [null], [81], [27], [81]]", + "[0, 1, 0, 1, 3, 2, 1, 2]"); + } +} + +TEST(Grouper, FloatingPointKey) { + TestGrouper g({float32()}); + + // -0.0 hashes differently from 0.0 + g.ExpectConsume("[[0.0], [-0.0]]", "[0, 1]"); + + g.ExpectConsume("[[Inf], [-Inf]]", "[2, 3]"); + + // assert(!(NaN == NaN)) does not cause spurious new groups + g.ExpectConsume("[[NaN], [NaN]]", "[4, 4]"); + + // TODO(bkietz) test denormal numbers, more NaNs +} + +TEST(Grouper, StringKey) { + for (auto ty : {utf8(), large_utf8(), fixed_size_binary(2)}) { + SCOPED_TRACE("key type: " + ty->ToString()); + + TestGrouper g({ty}); + + g.ExpectConsume(R"([["eh"], ["eh"]])", "[0, 0]"); + + g.ExpectConsume(R"([["eh"], ["eh"]])", "[0, 0]"); + + g.ExpectConsume(R"([["be"], [null]])", "[1, 2]"); + } +} + +TEST(Grouper, DictKey) { + TestGrouper g({dictionary(int32(), utf8())}); + + // For dictionary keys, all batches must share a single dictionary. + // Eventually, differing dictionaries will be unified and indices transposed + // during encoding to relieve this restriction. 
+ const auto dict = ArrayFromJSON(utf8(), R"(["ex", "why", "zee", null])"); + + auto WithIndices = [&](const std::string& indices) { + return Datum(*DictionaryArray::FromArrays(ArrayFromJSON(int32(), indices), dict)); + }; + + // NB: null index is not considered equivalent to index=3 (which encodes null in dict) + g.ExpectConsume({WithIndices(" [3, 1, null, 0, 2]")}, + ArrayFromJSON(uint32(), "[0, 1, 2, 3, 4]")); + + g = TestGrouper({dictionary(int32(), utf8())}); + + g.ExpectConsume({WithIndices(" [0, 1, 2, 3, null]")}, + ArrayFromJSON(uint32(), "[0, 1, 2, 3, 4]")); + + g.ExpectConsume({WithIndices(" [3, 1, null, 0, 2]")}, + ArrayFromJSON(uint32(), "[3, 1, 4, 0, 2]")); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + NotImplemented, HasSubstr("Unifying differing dictionaries"), + g.grouper_->Consume(*ExecBatch::Make({*DictionaryArray::FromArrays( + ArrayFromJSON(int32(), "[0, 1]"), + ArrayFromJSON(utf8(), R"(["different", "dictionary"])"))}))); +} + +TEST(Grouper, StringInt64Key) { + TestGrouper g({utf8(), int64()}); + + g.ExpectConsume(R"([["eh", 0], ["eh", 0]])", "[0, 0]"); + + g.ExpectConsume(R"([["eh", 0], ["eh", null]])", "[0, 1]"); + + g.ExpectConsume(R"([["eh", 1], ["bee", 1]])", "[2, 3]"); + + g.ExpectConsume(R"([["eh", null], ["bee", 1]])", "[1, 3]"); + + g = TestGrouper({utf8(), int64()}); + + g.ExpectConsume(R"([ + ["ex", 0], + ["ex", 0], + ["why", 0], + ["ex", 1], + ["why", 0], + ["ex", 1], + ["ex", 0], + ["why", 1] + ])", + "[0, 0, 1, 2, 1, 2, 0, 3]"); + + g.ExpectConsume(R"([ + ["ex", 0], + [null, 0], + [null, 0], + ["ex", 1], + [null, null], + ["ex", 1], + ["ex", 0], + ["why", null] + ])", + "[0, 4, 4, 2, 5, 2, 0, 6]"); +} + +TEST(Grouper, DoubleStringInt64Key) { + TestGrouper g({float64(), utf8(), int64()}); + + g.ExpectConsume(R"([[1.5, "eh", 0], [1.5, "eh", 0]])", "[0, 0]"); + + g.ExpectConsume(R"([[1.5, "eh", 0], [1.5, "eh", 0]])", "[0, 0]"); + + g.ExpectConsume(R"([[1.0, "eh", 0], [1.0, "be", null]])", "[1, 2]"); + + // note: -0 and +0 hash differently + g.ExpectConsume(R"([[-0.0, "be", 7], [0.0, "be", 7]])", "[3, 4]"); +} + +TEST(Grouper, RandomInt64Keys) { + TestGrouper g({int64()}); + for (int i = 0; i < 4; ++i) { + SCOPED_TRACE(std::to_string(i) + "th key batch"); + + ExecBatch key_batch{ + *random::GenerateBatch(g.key_schema_->fields(), 1 << 12, 0xDEADBEEF)}; + g.ConsumeAndValidate(key_batch); + } +} + +TEST(Grouper, RandomStringInt64Keys) { + TestGrouper g({utf8(), int64()}); + for (int i = 0; i < 4; ++i) { + SCOPED_TRACE(std::to_string(i) + "th key batch"); + + ExecBatch key_batch{ + *random::GenerateBatch(g.key_schema_->fields(), 1 << 12, 0xDEADBEEF)}; + g.ConsumeAndValidate(key_batch); + } +} + +TEST(Grouper, RandomStringInt64DoubleInt32Keys) { + TestGrouper g({utf8(), int64(), float64(), int32()}); + for (int i = 0; i < 4; ++i) { + SCOPED_TRACE(std::to_string(i) + "th key batch"); + + ExecBatch key_batch{ + *random::GenerateBatch(g.key_schema_->fields(), 1 << 12, 0xDEADBEEF)}; + g.ConsumeAndValidate(key_batch); + } +} + +TEST(Grouper, MakeGroupings) { + auto ExpectGroupings = [](std::string ids_json, std::string expected_json) { + auto ids = checked_pointer_cast(ArrayFromJSON(uint32(), ids_json)); + auto expected = ArrayFromJSON(list(int32()), expected_json); + + auto num_groups = static_cast(expected->length()); + ASSERT_OK_AND_ASSIGN(auto actual, internal::Grouper::MakeGroupings(*ids, num_groups)); + AssertArraysEqual(*expected, *actual, /*verbose=*/true); + + // validate ApplyGroupings + ASSERT_OK_AND_ASSIGN(auto grouped_ids, + 
internal::Grouper::ApplyGroupings(*actual, *ids)); + + for (uint32_t group = 0; group < num_groups; ++group) { + auto ids_slice = checked_pointer_cast(grouped_ids->value_slice(group)); + for (auto slot : *ids_slice) { + EXPECT_EQ(slot, group); + } + } + }; + + ExpectGroupings("[]", "[[]]"); + + ExpectGroupings("[0, 0, 0]", "[[0, 1, 2]]"); + + ExpectGroupings("[0, 0, 0, 1, 1, 2]", "[[0, 1, 2], [3, 4], [5], []]"); + + ExpectGroupings("[2, 1, 2, 1, 1, 2]", "[[], [1, 3, 4], [0, 2, 5], [], []]"); + + ExpectGroupings("[2, 2, 5, 5, 2, 3]", "[[], [], [0, 1, 4], [5], [], [2, 3], [], []]"); + + auto ids = checked_pointer_cast(ArrayFromJSON(uint32(), "[0, null, 1]")); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("MakeGroupings with null ids"), + internal::Grouper::MakeGroupings(*ids, 5)); +} + +TEST(GroupBy, Errors) { + auto batch = RecordBatchFromJSON( + schema({field("argument", float64()), field("group_id", uint32())}), R"([ + [1.0, 1], + [null, 1], + [0.0, 2], + [null, 3], + [4.0, 0], + [3.25, 1], + [0.125, 2], + [-0.25, 2], + [0.75, 0], + [null, 3] + ])"); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + NotImplemented, HasSubstr("Direct execution of HASH_AGGREGATE functions"), + CallFunction("hash_sum", {batch->GetColumnByName("argument"), + batch->GetColumnByName("group_id"), Datum(uint32_t(4))})); +} + +TEST(GroupBy, SumOnly) { + auto batch = RecordBatchFromJSON( + schema({field("argument", float64()), field("key", int64())}), R"([ + [1.0, 1], + [null, 1], + [0.0, 2], + [null, 3], + [4.0, null], + [3.25, 1], + [0.125, 2], + [-0.25, 2], + [0.75, null], + [null, 3] + ])"); + + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + internal::GroupBy({batch->GetColumnByName("argument")}, + {batch->GetColumnByName("key")}, + { + {"hash_sum", nullptr}, + })); + + AssertDatumsEqual(ArrayFromJSON(struct_({ + field("hash_sum", float64()), + field("key_0", int64()), + }), + R"([ + [4.25, 1], + [-0.125, 2], + [null, 3], + [4.75, null] + ])"), + aggregated_and_grouped, + /*verbose=*/true); +} + +TEST(GroupBy, MinMaxOnly) { + auto batch = RecordBatchFromJSON( + schema({field("argument", float64()), field("key", int64())}), R"([ + [1.0, 1], + [null, 1], + [0.0, 2], + [null, 3], + [4.0, null], + [3.25, 1], + [0.125, 2], + [-0.25, 2], + [0.75, null], + [null, 3] + ])"); + + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + internal::GroupBy({batch->GetColumnByName("argument")}, + {batch->GetColumnByName("key")}, + { + {"hash_min_max", nullptr}, + })); + + AssertDatumsEqual(ArrayFromJSON(struct_({ + field("hash_min_max", struct_({ + field("min", float64()), + field("max", float64()), + })), + field("key_0", int64()), + }), + R"([ + [{"min": 1.0, "max": 3.25}, 1], + [{"min": -0.25, "max": 0.125}, 2], + [{"min": null, "max": null}, 3], + [{"min": 0.75, "max": 4.0}, null] + ])"), + aggregated_and_grouped, + /*verbose=*/true); +} + +TEST(GroupBy, CountAndSum) { + auto batch = RecordBatchFromJSON( + schema({field("argument", float64()), field("key", int64())}), R"([ + [1.0, 1], + [null, 1], + [0.0, 2], + [null, 3], + [4.0, null], + [3.25, 1], + [0.125, 2], + [-0.25, 2], + [0.75, null], + [null, 3] + ])"); + + CountOptions count_options; + ASSERT_OK_AND_ASSIGN( + Datum aggregated_and_grouped, + internal::GroupBy( + { + // NB: passing an argument twice or also using it as a key is legal + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("key"), + }, + { + batch->GetColumnByName("key"), + }, + { + {"hash_count", &count_options}, + {"hash_sum", nullptr}, + {"hash_sum", 
nullptr}, + })); + + AssertDatumsEqual( + ArrayFromJSON(struct_({ + field("hash_count", int64()), + // NB: summing a float32 array results in float64 sums + field("hash_sum", float64()), + field("hash_sum", int64()), + field("key_0", int64()), + }), + R"([ + [2, 4.25, 3, 1], + [3, -0.125, 6, 2], + [0, null, 6, 3], + [2, 4.75, null, null] + ])"), + aggregated_and_grouped, + /*verbose=*/true); +} + +TEST(GroupBy, SumOnlyStringAndDictKeys) { + for (auto key_type : {utf8(), dictionary(int32(), utf8())}) { + SCOPED_TRACE("key type: " + key_type->ToString()); + + auto batch = RecordBatchFromJSON( + schema({field("argument", float64()), field("key", key_type)}), R"([ + [1.0, "alfa"], + [null, "alfa"], + [0.0, "beta"], + [null, "gama"], + [4.0, null ], + [3.25, "alfa"], + [0.125, "beta"], + [-0.25, "beta"], + [0.75, null ], + [null, "gama"] + ])"); + + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + internal::GroupBy({batch->GetColumnByName("argument")}, + {batch->GetColumnByName("key")}, + { + {"hash_sum", nullptr}, + })); + + AssertDatumsEqual(ArrayFromJSON(struct_({ + field("hash_sum", float64()), + field("key_0", key_type), + }), + R"([ + [4.25, "alfa"], + [-0.125, "beta"], + [null, "gama"], + [4.75, null ] + ])"), + aggregated_and_grouped, + /*verbose=*/true); + } +} + +TEST(GroupBy, ConcreteCaseWithValidateGroupBy) { + auto batch = RecordBatchFromJSON( + schema({field("argument", float64()), field("key", utf8())}), R"([ + [1.0, "alfa"], + [null, "alfa"], + [0.0, "beta"], + [null, "gama"], + [4.0, null ], + [3.25, "alfa"], + [0.125, "beta"], + [-0.25, "beta"], + [0.75, null ], + [null, "gama"] + ])"); + + CountOptions count_non_null{CountOptions::COUNT_NON_NULL}, + count_null{CountOptions::COUNT_NULL}; + + MinMaxOptions emit_null{MinMaxOptions::EMIT_NULL}; + + using internal::Aggregate; + for (auto agg : { + Aggregate{"hash_sum", nullptr}, + Aggregate{"hash_count", &count_non_null}, + Aggregate{"hash_count", &count_null}, + Aggregate{"hash_min_max", nullptr}, + Aggregate{"hash_min_max", &emit_null}, + }) { + SCOPED_TRACE(agg.function); + ValidateGroupBy({agg}, {batch->GetColumnByName("argument")}, + {batch->GetColumnByName("key")}); + } +} + +TEST(GroupBy, RandomArraySum) { + for (int64_t length : {1 << 10, 1 << 12, 1 << 15}) { + for (auto null_probability : {0.0, 0.01, 0.5, 1.0}) { + auto batch = random::GenerateBatch( + { + field("argument", float32(), + key_value_metadata( + {{"null_probability", std::to_string(null_probability)}})), + field("key", int64(), key_value_metadata({{"min", "0"}, {"max", "100"}})), + }, + length, 0xDEADBEEF); + + ValidateGroupBy( + { + {"hash_sum", nullptr}, + }, + {batch->GetColumnByName("argument")}, {batch->GetColumnByName("key")}); + } + } +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc index 77890d27da5..160c4ce8857 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc @@ -302,8 +302,8 @@ struct CastFunctor> { // Decimal to integer struct DecimalToIntegerMixin { - template - OutValue ToInteger(KernelContext* ctx, const Decimal128& val) const { + template + OutValue ToInteger(KernelContext* ctx, const Arg0Value& val) const { constexpr auto min_value = std::numeric_limits::min(); constexpr auto max_value = std::numeric_limits::max(); @@ -326,7 +326,7 @@ struct UnsafeUpscaleDecimalToInteger : public DecimalToIntegerMixin { using 
DecimalToIntegerMixin::DecimalToIntegerMixin; template - OutValue Call(KernelContext* ctx, Decimal128 val) const { + OutValue Call(KernelContext* ctx, Arg0Value val) const { return ToInteger(ctx, val.IncreaseScaleBy(-in_scale_)); } }; @@ -335,7 +335,7 @@ struct UnsafeDownscaleDecimalToInteger : public DecimalToIntegerMixin { using DecimalToIntegerMixin::DecimalToIntegerMixin; template - OutValue Call(KernelContext* ctx, Decimal128 val) const { + OutValue Call(KernelContext* ctx, Arg0Value val) const { return ToInteger(ctx, val.ReduceScaleBy(in_scale_, false)); } }; @@ -344,7 +344,7 @@ struct SafeRescaleDecimalToInteger : public DecimalToIntegerMixin { using DecimalToIntegerMixin::DecimalToIntegerMixin; template - OutValue Call(KernelContext* ctx, Decimal128 val) const { + OutValue Call(KernelContext* ctx, Arg0Value val) const { auto result = val.Rescale(in_scale_, 0); if (ARROW_PREDICT_FALSE(!result.ok())) { ctx->SetStatus(result.status()); @@ -355,35 +355,33 @@ struct SafeRescaleDecimalToInteger : public DecimalToIntegerMixin { } }; -template -struct CastFunctor::value>> { +template +struct CastFunctor::value && is_decimal_type::value>> { using out_type = typename O::c_type; static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& options = checked_cast(ctx->state())->options; - const auto& in_type_inst = checked_cast(*batch[0].type()); + const auto& in_type_inst = checked_cast(*batch[0].type()); const auto in_scale = in_type_inst.scale(); if (options.allow_decimal_truncate) { if (in_scale < 0) { // Unsafe upscale - applicator::ScalarUnaryNotNullStateful + applicator::ScalarUnaryNotNullStateful kernel(UnsafeUpscaleDecimalToInteger{in_scale, options.allow_int_overflow}); return kernel.Exec(ctx, batch, out); } else { // Unsafe downscale - applicator::ScalarUnaryNotNullStateful + applicator::ScalarUnaryNotNullStateful kernel(UnsafeDownscaleDecimalToInteger{in_scale, options.allow_int_overflow}); return kernel.Exec(ctx, batch, out); } } else { // Safe rescale - applicator::ScalarUnaryNotNullStateful - kernel(SafeRescaleDecimalToInteger{in_scale, options.allow_int_overflow}); + applicator::ScalarUnaryNotNullStateful kernel( + SafeRescaleDecimalToInteger{in_scale, options.allow_int_overflow}); return kernel.Exec(ctx, batch, out); } } @@ -392,72 +390,104 @@ struct CastFunctor::value>> { // ---------------------------------------------------------------------- // Decimal to decimal +// Helper that converts the input and output decimals +// For instance, Decimal128 -> Decimal256 requires converting, then scaling +// Decimal256 -> Decimal128 requires scaling, then truncating +template +struct DecimalConversions {}; + +template +struct DecimalConversions { + // Convert then scale + static Decimal256 ConvertInput(InDecimal&& val) { return Decimal256(val); } + static Decimal256 ConvertOutput(Decimal256&& val) { return val; } +}; + +template <> +struct DecimalConversions { + // Scale then truncate + static Decimal256 ConvertInput(Decimal256&& val) { return val; } + static Decimal128 ConvertOutput(Decimal256&& val) { + return Decimal128(val.little_endian_array()[1], val.little_endian_array()[0]); + } +}; + +template <> +struct DecimalConversions { + static Decimal128 ConvertInput(Decimal128&& val) { return val; } + static Decimal128 ConvertOutput(Decimal128&& val) { return val; } +}; + struct UnsafeUpscaleDecimal { - template - Decimal128 Call(KernelContext* ctx, Decimal128 val) const { - return val.IncreaseScaleBy(by_); + template + OutValue Call(KernelContext* ctx, 
Arg0Value val) const { + using Conv = DecimalConversions; + return Conv::ConvertOutput(Conv::ConvertInput(std::move(val)).IncreaseScaleBy(by_)); } int32_t by_; }; struct UnsafeDownscaleDecimal { - template - Decimal128 Call(KernelContext* ctx, Decimal128 val) const { - return val.ReduceScaleBy(by_, false); + template + OutValue Call(KernelContext* ctx, Arg0Value val) const { + using Conv = DecimalConversions; + return Conv::ConvertOutput( + Conv::ConvertInput(std::move(val)).ReduceScaleBy(by_, false)); } int32_t by_; }; struct SafeRescaleDecimal { - template - Decimal128 Call(KernelContext* ctx, Decimal128 val) const { - auto maybe_rescaled = val.Rescale(in_scale_, out_scale_); + template + OutValue Call(KernelContext* ctx, Arg0Value val) const { + using Conv = DecimalConversions; + auto maybe_rescaled = + Conv::ConvertInput(std::move(val)).Rescale(in_scale_, out_scale_); if (ARROW_PREDICT_FALSE(!maybe_rescaled.ok())) { ctx->SetStatus(maybe_rescaled.status()); return {}; // Zero } if (ARROW_PREDICT_TRUE(maybe_rescaled->FitsInPrecision(out_precision_))) { - return maybe_rescaled.MoveValueUnsafe(); + return Conv::ConvertOutput(maybe_rescaled.MoveValueUnsafe()); } - ctx->SetStatus(Status::Invalid("Decimal value does not fit in precision")); + ctx->SetStatus( + Status::Invalid("Decimal value does not fit in precision ", out_precision_)); return {}; // Zero } int32_t out_scale_, out_precision_, in_scale_; }; -template <> -struct CastFunctor { +template +struct CastFunctor::value && is_decimal_type::value>> { static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& options = checked_cast(ctx->state())->options; - const auto& in_type = checked_cast(*batch[0].type()); - const auto& out_type = checked_cast(*out->type()); + const auto& in_type = checked_cast(*batch[0].type()); + const auto& out_type = checked_cast(*out->type()); const auto in_scale = in_type.scale(); const auto out_scale = out_type.scale(); if (options.allow_decimal_truncate) { if (in_scale < out_scale) { // Unsafe upscale - applicator::ScalarUnaryNotNullStateful - kernel(UnsafeUpscaleDecimal{out_scale - in_scale}); + applicator::ScalarUnaryNotNullStateful kernel( + UnsafeUpscaleDecimal{out_scale - in_scale}); return kernel.Exec(ctx, batch, out); } else { // Unsafe downscale - applicator::ScalarUnaryNotNullStateful - kernel(UnsafeDownscaleDecimal{in_scale - out_scale}); + applicator::ScalarUnaryNotNullStateful kernel( + UnsafeDownscaleDecimal{in_scale - out_scale}); return kernel.Exec(ctx, batch, out); } } // Safe rescale - applicator::ScalarUnaryNotNullStateful - kernel(SafeRescaleDecimal{out_scale, out_type.precision(), in_scale}); + applicator::ScalarUnaryNotNullStateful kernel( + SafeRescaleDecimal{out_scale, out_type.precision(), in_scale}); return kernel.Exec(ctx, batch, out); } }; @@ -467,8 +497,8 @@ struct CastFunctor { struct RealToDecimal { template - Decimal128 Call(KernelContext* ctx, RealType val) const { - auto maybe_decimal = Decimal128::FromReal(val, out_precision_, out_scale_); + OutValue Call(KernelContext* ctx, RealType val) const { + auto maybe_decimal = OutValue::FromReal(val, out_precision_, out_scale_); if (ARROW_PREDICT_TRUE(maybe_decimal.ok())) { return maybe_decimal.MoveValueUnsafe(); @@ -484,15 +514,16 @@ struct RealToDecimal { bool allow_truncate_; }; -template -struct CastFunctor::value>> { +template +struct CastFunctor::value && is_floating_type::value>> { static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& options = 
checked_cast(ctx->state())->options; - const auto& out_type = checked_cast(*out->type()); + const auto& out_type = checked_cast(*out->type()); const auto out_scale = out_type.scale(); const auto out_precision = out_type.precision(); - applicator::ScalarUnaryNotNullStateful kernel( + applicator::ScalarUnaryNotNullStateful kernel( RealToDecimal{out_scale, out_precision, options.allow_decimal_truncate}); return kernel.Exec(ctx, batch, out); } @@ -503,20 +534,21 @@ struct CastFunctor::value>> { struct DecimalToReal { template - RealType Call(KernelContext* ctx, const Decimal128& val) const { - return val.ToReal(in_scale_); + RealType Call(KernelContext* ctx, const Arg0Value& val) const { + return val.template ToReal(in_scale_); } int32_t in_scale_; }; -template -struct CastFunctor::value>> { +template +struct CastFunctor::value && is_decimal_type::value>> { static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const auto& in_type = checked_cast(*batch[0].type()); + const auto& in_type = checked_cast(*batch[0].type()); const auto in_scale = in_type.scale(); - applicator::ScalarUnaryNotNullStateful kernel( + applicator::ScalarUnaryNotNullStateful kernel( DecimalToReal{in_scale}); return kernel.Exec(ctx, batch, out); } @@ -562,6 +594,8 @@ std::shared_ptr GetCastToInteger(std::string name) { // From decimal to integer DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType(Type::DECIMAL)}, out_ty, CastFunctor::Exec)); + DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty, + CastFunctor::Exec)); return func; } @@ -586,6 +620,8 @@ std::shared_ptr GetCastToFloating(std::string name) { // From decimal to floating point DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType(Type::DECIMAL)}, out_ty, CastFunctor::Exec)); + DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty, + CastFunctor::Exec)); return func; } @@ -606,6 +642,9 @@ std::shared_ptr GetCastToDecimal128() { // We resolve the output type of this kernel from the CastOptions DCHECK_OK( func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec)); + exec = CastFunctor::Exec; + DCHECK_OK( + func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, sig_out_ty, exec)); return func; } @@ -613,10 +652,21 @@ std::shared_ptr GetCastToDecimal256() { OutputType sig_out_ty(ResolveOutputFromOptions); auto func = std::make_shared("cast_decimal256", Type::DECIMAL256); - // Needed for Parquet conversion. Full implementation is ARROW-10606 - // tracks full implementation. 
AddCommonCasts(Type::DECIMAL256, sig_out_ty, func.get()); + // Cast from floating point + DCHECK_OK(func->AddKernel(Type::FLOAT, {float32()}, sig_out_ty, + CastFunctor::Exec)); + DCHECK_OK(func->AddKernel(Type::DOUBLE, {float64()}, sig_out_ty, + CastFunctor::Exec)); + + // Cast from other decimal + auto exec = CastFunctor::Exec; + DCHECK_OK( + func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec)); + exec = CastFunctor::Exec; + DCHECK_OK( + func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, sig_out_ty, exec)); return func; } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 99a56346c1b..10e5ed26e5d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -404,7 +404,7 @@ TEST(Cast, IntToFloating) { CastOptions::Safe(float64())); } -TEST(Cast, DecimalToInt) { +TEST(Cast, Decimal128ToInt) { auto options = CastOptions::Safe(int64()); for (bool allow_int_overflow : {false, true}) { @@ -494,7 +494,98 @@ TEST(Cast, DecimalToInt) { CheckCast(negative_scale, ArrayFromJSON(int64(), "[1234567890000, -120000]"), options); } -TEST(Cast, DecimalToDecimal) { +TEST(Cast, Decimal256ToInt) { + auto options = CastOptions::Safe(int64()); + + for (bool allow_int_overflow : {false, true}) { + for (bool allow_decimal_truncate : {false, true}) { + options.allow_int_overflow = allow_int_overflow; + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_overflow_no_truncation = ArrayFromJSON(decimal256(40, 10), R"([ + "02.0000000000", + "-11.0000000000", + "22.0000000000", + "-121.0000000000", + null])"); + CheckCast(no_overflow_no_truncation, + ArrayFromJSON(int64(), "[2, -11, 22, -121, null]"), options); + } + } + + for (bool allow_int_overflow : {false, true}) { + options.allow_int_overflow = allow_int_overflow; + auto truncation_but_no_overflow = ArrayFromJSON(decimal256(40, 10), R"([ + "02.1000000000", + "-11.0000004500", + "22.0000004500", + "-121.1210000000", + null])"); + + options.allow_decimal_truncate = true; + CheckCast(truncation_but_no_overflow, + ArrayFromJSON(int64(), "[2, -11, 22, -121, null]"), options); + + options.allow_decimal_truncate = false; + CheckCastFails(truncation_but_no_overflow, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto overflow_no_truncation = ArrayFromJSON(decimal256(40, 10), R"([ + "1234567890123456789000000.0000000000", + "9999999999999999999999999.0000000000", + null])"); + + options.allow_int_overflow = true; + CheckCast(overflow_no_truncation, + ArrayFromJSON( + int64(), + // 1234567890123456789000000 % 2**64, 9999999999999999999999999 % 2**64 + "[1096246371337547584, 1590897978359414783, null]"), + options); + + options.allow_int_overflow = false; + CheckCastFails(overflow_no_truncation, options); + } + + for (bool allow_int_overflow : {false, true}) { + for (bool allow_decimal_truncate : {false, true}) { + options.allow_int_overflow = allow_int_overflow; + options.allow_decimal_truncate = allow_decimal_truncate; + + auto overflow_and_truncation = ArrayFromJSON(decimal256(40, 10), R"([ + "1234567890123456789000000.0045345000", + "9999999999999999999999999.0000344300", + null])"); + + if (options.allow_int_overflow && options.allow_decimal_truncate) { + CheckCast( + overflow_and_truncation, + ArrayFromJSON( + int64(), + // 1234567890123456789000000 % 2**64, 9999999999999999999999999 % 2**64 + 
"[1096246371337547584, 1590897978359414783, null]"), + options); + } else { + CheckCastFails(overflow_and_truncation, options); + } + } + } + + Decimal256Builder builder(decimal256(40, -4)); + for (auto d : {Decimal256("1234567890000."), Decimal256("-120000.")}) { + ASSERT_OK_AND_ASSIGN(d, d.Rescale(0, -4)); + ASSERT_OK(builder.Append(d)); + } + ASSERT_OK_AND_ASSIGN(auto negative_scale, builder.Finish()); + options.allow_int_overflow = true; + options.allow_decimal_truncate = true; + CheckCast(negative_scale, ArrayFromJSON(int64(), "[1234567890000, -120000]"), options); +} + +TEST(Cast, Decimal128ToDecimal128) { CastOptions options; for (bool allow_decimal_truncate : {false, true}) { @@ -573,51 +664,306 @@ TEST(Cast, DecimalToDecimal) { } } -TEST(Cast, FloatingToDecimal) { - for (auto float_type : {float32(), float64()}) { - CheckCast( - ArrayFromJSON(float_type, "[0.0, null, 123.45, 123.456, 999.994]"), - ArrayFromJSON(decimal(5, 2), R"(["0.00", null, "123.45", "123.46", "999.99"])")); +TEST(Cast, Decimal256ToDecimal256) { + CastOptions options; - // Overflow - CastOptions options; - options.to_type = decimal(5, 2); - CheckCastFails(ArrayFromJSON(float_type, "[999.996]"), options); + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_truncation = ArrayFromJSON(decimal256(38, 10), R"([ + "02.0000000000", + "30.0000000000", + "22.0000000000", + "-121.0000000000", + null])"); + auto expected = ArrayFromJSON(decimal256(28, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + CheckCast(expected, no_truncation, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + // Same scale, different precision + auto d_5_2 = ArrayFromJSON(decimal256(5, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal256(4, 2), R"([ + "12.34", + "0.56"])"); + + CheckCast(d_5_2, d_4_2, options); + CheckCast(d_4_2, d_5_2, options); + } + + auto d_38_10 = ArrayFromJSON(decimal256(38, 10), R"([ + "-02.1234567890", + "30.1234567890", + null])"); + + auto d_28_0 = ArrayFromJSON(decimal256(28, 0), R"([ + "-02.", + "30.", + null])"); + + auto d_38_10_roundtripped = ArrayFromJSON(decimal256(38, 10), R"([ + "-02.0000000000", + "30.0000000000", + null])"); + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d_38_10, d_28_0, options); + CheckCast(d_28_0, d_38_10_roundtripped, options); + + options.allow_decimal_truncate = false; + options.to_type = d_28_0->type(); + CheckCastFails(d_38_10, options); + CheckCast(d_28_0, d_38_10_roundtripped, options); + + // Precision loss without rescale leads to truncation + auto d_4_2 = ArrayFromJSON(decimal256(4, 2), R"(["12.34"])"); + for (auto expected : { + ArrayFromJSON(decimal256(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal256(4, 3), R"(["12.340"])"), + ArrayFromJSON(decimal256(2, 1), R"(["12.3"])"), + }) { options.allow_decimal_truncate = true; - CheckCast( - ArrayFromJSON(float_type, "[0.0, null, 999.996, 123.45, 999.994]"), - ArrayFromJSON(decimal(5, 2), R"(["0.00", null, "0.00", "123.45", "999.99"])"), - options); + CheckCast(d_4_2, expected, options); + + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + CheckCastFails(d_4_2, options); } +} - // 2**64 + 2**41 (exactly representable as a float) - CheckCast(ArrayFromJSON(float32(), "[1.8446746e+19, -1.8446746e+19]"), - ArrayFromJSON(decimal(20, 
0), - R"(["18446746272732807168", "-18446746272732807168"])")); +TEST(Cast, Decimal128ToDecimal256) { + CastOptions options; - CheckCast(ArrayFromJSON(float32(), "[1.8446746e+15, -1.8446746e+15]"), - ArrayFromJSON(decimal(20, 4), - R"(["1844674627273280.7168", "-1844674627273280.7168"])")); + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; - CheckCast(ArrayFromJSON(float64(), "[1.8446744073709556e+19, -1.8446744073709556e+19]"), - ArrayFromJSON(decimal(20, 0), - R"(["18446744073709555712", "-18446744073709555712"])")); + auto no_truncation = ArrayFromJSON(decimal(38, 10), R"([ + "02.0000000000", + "30.0000000000", + "22.0000000000", + "-121.0000000000", + null])"); + auto expected = ArrayFromJSON(decimal256(48, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + // Same scale, different precision + auto d_5_2 = ArrayFromJSON(decimal(5, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal256(4, 2), R"([ + "12.34", + "0.56"])"); + auto d_40_2 = ArrayFromJSON(decimal256(40, 2), R"([ + "12.34", + "0.56"])"); + + CheckCast(d_5_2, d_4_2, options); + CheckCast(d_5_2, d_40_2, options); + } + + auto d128_38_10 = ArrayFromJSON(decimal(38, 10), R"([ + "-02.1234567890", + "30.1234567890", + null])"); + + auto d128_28_0 = ArrayFromJSON(decimal(28, 0), R"([ + "-02.", + "30.", + null])"); + + auto d256_28_0 = ArrayFromJSON(decimal256(28, 0), R"([ + "-02.", + "30.", + null])"); + + auto d256_38_10_roundtripped = ArrayFromJSON(decimal256(38, 10), R"([ + "-02.0000000000", + "30.0000000000", + null])"); - CheckCast(ArrayFromJSON(float64(), "[1.8446744073709556e+15, -1.8446744073709556e+15]"), - ArrayFromJSON(decimal(20, 4), - R"(["1844674407370955.5712", "-1844674407370955.5712"])")); + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d128_38_10, d256_28_0, options); + CheckCast(d128_28_0, d256_38_10_roundtripped, options); + + options.allow_decimal_truncate = false; + options.to_type = d256_28_0->type(); + CheckCastFails(d128_38_10, options); + CheckCast(d128_28_0, d256_38_10_roundtripped, options); - // Edge cases are tested for Decimal128::FromReal() + // Precision loss without rescale leads to truncation + auto d128_4_2 = ArrayFromJSON(decimal(4, 2), R"(["12.34"])"); + for (auto expected : { + ArrayFromJSON(decimal256(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal256(4, 3), R"(["12.340"])"), + ArrayFromJSON(decimal256(2, 1), R"(["12.3"])"), + }) { + options.allow_decimal_truncate = true; + CheckCast(d128_4_2, expected, options); + + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + CheckCastFails(d128_4_2, options); + } +} + +TEST(Cast, Decimal256ToDecimal128) { + CastOptions options; + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_truncation = ArrayFromJSON(decimal256(42, 10), R"([ + "02.0000000000", + "30.0000000000", + "22.0000000000", + "-121.0000000000", + null])"); + auto expected = ArrayFromJSON(decimal(28, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + // Same scale, different precision + auto d_5_2 = 
ArrayFromJSON(decimal256(42, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal(4, 2), R"([ + "12.34", + "0.56"])"); + + CheckCast(d_5_2, d_4_2, options); + } + + auto d256_52_10 = ArrayFromJSON(decimal256(52, 10), R"([ + "-02.1234567890", + "30.1234567890", + null])"); + + auto d256_42_0 = ArrayFromJSON(decimal256(42, 0), R"([ + "-02.", + "30.", + null])"); + + auto d128_28_0 = ArrayFromJSON(decimal(28, 0), R"([ + "-02.", + "30.", + null])"); + + auto d128_38_10_roundtripped = ArrayFromJSON(decimal(38, 10), R"([ + "-02.0000000000", + "30.0000000000", + null])"); + + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d256_52_10, d128_28_0, options); + CheckCast(d256_42_0, d128_38_10_roundtripped, options); + + options.allow_decimal_truncate = false; + options.to_type = d128_28_0->type(); + CheckCastFails(d256_52_10, options); + CheckCast(d256_42_0, d128_38_10_roundtripped, options); + + // Precision loss without rescale leads to truncation + auto d256_4_2 = ArrayFromJSON(decimal256(4, 2), R"(["12.34"])"); + for (auto expected : { + ArrayFromJSON(decimal(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal(4, 3), R"(["12.340"])"), + ArrayFromJSON(decimal(2, 1), R"(["12.3"])"), + }) { + options.allow_decimal_truncate = true; + CheckCast(d256_4_2, expected, options); + + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + CheckCastFails(d256_4_2, options); + } +} + +TEST(Cast, FloatingToDecimal) { + for (auto float_type : {float32(), float64()}) { + for (auto decimal_type : {decimal(5, 2), decimal256(5, 2)}) { + CheckCast( + ArrayFromJSON(float_type, "[0.0, null, 123.45, 123.456, 999.994]"), + ArrayFromJSON(decimal_type, R"(["0.00", null, "123.45", "123.46", "999.99"])")); + + // Overflow + CastOptions options; + options.to_type = decimal_type; + CheckCastFails(ArrayFromJSON(float_type, "[999.996]"), options); + + options.allow_decimal_truncate = true; + CheckCast( + ArrayFromJSON(float_type, "[0.0, null, 999.996, 123.45, 999.994]"), + ArrayFromJSON(decimal_type, R"(["0.00", null, "0.00", "123.45", "999.99"])"), + options); + } + } + + for (auto decimal_type : {decimal128, decimal256}) { + // 2**64 + 2**41 (exactly representable as a float) + CheckCast(ArrayFromJSON(float32(), "[1.8446746e+19, -1.8446746e+19]"), + ArrayFromJSON(decimal_type(20, 0), + R"(["18446746272732807168", "-18446746272732807168"])")); + + CheckCast( + ArrayFromJSON(float64(), "[1.8446744073709556e+19, -1.8446744073709556e+19]"), + ArrayFromJSON(decimal_type(20, 0), + R"(["18446744073709555712", "-18446744073709555712"])")); + + CheckCast(ArrayFromJSON(float32(), "[1.8446746e+15, -1.8446746e+15]"), + ArrayFromJSON(decimal_type(20, 4), + R"(["1844674627273280.7168", "-1844674627273280.7168"])")); + + CheckCast( + ArrayFromJSON(float64(), "[1.8446744073709556e+15, -1.8446744073709556e+15]"), + ArrayFromJSON(decimal_type(20, 4), + R"(["1844674407370955.5712", "-1844674407370955.5712"])")); + + // Edge cases are tested for Decimal128::FromReal() and Decimal256::FromReal + } } TEST(Cast, DecimalToFloating) { for (auto float_type : {float32(), float64()}) { - CheckCast(ArrayFromJSON(decimal(5, 2), R"(["0.00", null, "123.45", "999.99"])"), - ArrayFromJSON(float_type, "[0.0, null, 123.45, 999.99]")); + for (auto decimal_type : {decimal(5, 2), decimal256(5, 2)}) { + CheckCast(ArrayFromJSON(decimal_type, R"(["0.00", null, "123.45", "999.99"])"), + ArrayFromJSON(float_type, "[0.0, null, 123.45, 999.99]")); + } } - // Edge cases are tested for 
Decimal128::ToReal() + // Edge cases are tested for Decimal128::ToReal() and Decimal256::ToReal() } TEST(Cast, TimestampToTimestamp) { diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 2eeac71c727..39869879561 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -23,6 +23,10 @@ #include #endif +#ifdef ARROW_WITH_RE2 +#include +#endif + #include "arrow/array/builder_binary.h" #include "arrow/array/builder_nested.h" #include "arrow/buffer_builder.h" @@ -64,6 +68,22 @@ struct BinaryLength { } }; +struct Utf8Length { + template + static OutValue Call(KernelContext*, Arg0Value val) { + auto str = reinterpret_cast(val.data()); + auto strlen = val.size(); + + OutValue length = 0; + while (strlen > 0) { + length += ((*str & 0xc0) != 0x80); + ++str; + --strlen; + } + return length; + } +}; + #ifdef ARROW_WITH_UTF8PROC // Direct lookup tables for unicode properties @@ -1214,6 +1234,197 @@ void AddSplit(FunctionRegistry* registry) { #endif } +// ---------------------------------------------------------------------- +// Replace substring (plain, regex) + +template +struct ReplaceSubString { + using ScalarType = typename TypeTraits::ScalarType; + using offset_type = typename Type::offset_type; + using ValueDataBuilder = TypedBufferBuilder; + using OffsetBuilder = TypedBufferBuilder; + using State = OptionsWrapper; + + static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + // TODO Cache replacer accross invocations (for regex compilation) + Replacer replacer{ctx, State::Get(ctx)}; + if (!ctx->HasError()) { + Replace(ctx, batch, &replacer, out); + } + } + + static void Replace(KernelContext* ctx, const ExecBatch& batch, Replacer* replacer, + Datum* out) { + ValueDataBuilder value_data_builder(ctx->memory_pool()); + OffsetBuilder offset_builder(ctx->memory_pool()); + + if (batch[0].kind() == Datum::ARRAY) { + // We already know how many strings we have, so we can use Reserve/UnsafeAppend + KERNEL_RETURN_IF_ERROR(ctx, offset_builder.Reserve(batch[0].array()->length)); + offset_builder.UnsafeAppend(0); // offsets start at 0 + + const ArrayData& input = *batch[0].array(); + KERNEL_RETURN_IF_ERROR( + ctx, VisitArrayDataInline( + input, + [&](util::string_view s) { + RETURN_NOT_OK(replacer->ReplaceString(s, &value_data_builder)); + offset_builder.UnsafeAppend( + static_cast(value_data_builder.length())); + return Status::OK(); + }, + [&]() { + // offset for null value + offset_builder.UnsafeAppend( + static_cast(value_data_builder.length())); + return Status::OK(); + })); + ArrayData* output = out->mutable_array(); + KERNEL_RETURN_IF_ERROR(ctx, value_data_builder.Finish(&output->buffers[2])); + KERNEL_RETURN_IF_ERROR(ctx, offset_builder.Finish(&output->buffers[1])); + } else { + const auto& input = checked_cast(*batch[0].scalar()); + auto result = std::make_shared(); + if (input.is_valid) { + util::string_view s = static_cast(*input.value); + KERNEL_RETURN_IF_ERROR(ctx, replacer->ReplaceString(s, &value_data_builder)); + KERNEL_RETURN_IF_ERROR(ctx, value_data_builder.Finish(&result->value)); + result->is_valid = true; + } + out->value = result; + } + } +}; + +struct PlainSubStringReplacer { + const ReplaceSubstringOptions& options_; + + PlainSubStringReplacer(KernelContext* ctx, const ReplaceSubstringOptions& options) + : options_(options) {} + + Status ReplaceString(util::string_view s, TypedBufferBuilder* builder) { + const char* i = s.begin(); + const char* 
end = s.end(); + int64_t max_replacements = options_.max_replacements; + while ((i < end) && (max_replacements != 0)) { + const char* pos = + std::search(i, end, options_.pattern.begin(), options_.pattern.end()); + if (pos == end) { + RETURN_NOT_OK(builder->Append(reinterpret_cast(i), + static_cast(end - i))); + i = end; + } else { + // the string before the pattern + RETURN_NOT_OK(builder->Append(reinterpret_cast(i), + static_cast(pos - i))); + // the replacement + RETURN_NOT_OK( + builder->Append(reinterpret_cast(options_.replacement.data()), + options_.replacement.length())); + // skip pattern + i = pos + options_.pattern.length(); + max_replacements--; + } + } + // if we exited early due to max_replacements, add the trailing part + RETURN_NOT_OK(builder->Append(reinterpret_cast(i), + static_cast(end - i))); + return Status::OK(); + } +}; + +#ifdef ARROW_WITH_RE2 +struct RegexSubStringReplacer { + const ReplaceSubstringOptions& options_; + const RE2 regex_find_; + const RE2 regex_replacement_; + + // Using RE2::FindAndConsume we can only find the pattern if it is a group, therefore + // we have 2 regexes, one with () around it, one without. + RegexSubStringReplacer(KernelContext* ctx, const ReplaceSubstringOptions& options) + : options_(options), + regex_find_("(" + options_.pattern + ")"), + regex_replacement_(options_.pattern) { + if (!(regex_find_.ok() && regex_replacement_.ok())) { + ctx->SetStatus(Status::Invalid("Regular expression error")); + return; + } + } + + Status ReplaceString(util::string_view s, TypedBufferBuilder* builder) { + re2::StringPiece replacement(options_.replacement); + if (options_.max_replacements == -1) { + std::string s_copy(s.to_string()); + re2::RE2::GlobalReplace(&s_copy, regex_replacement_, replacement); + RETURN_NOT_OK(builder->Append(reinterpret_cast(s_copy.data()), + s_copy.length())); + return Status::OK(); + } + + // Since RE2 does not have the concept of max_replacements, we have to do some work + // ourselves. + // We might do this faster similar to RE2::GlobalReplace using Match and Rewrite + const char* i = s.begin(); + const char* end = s.end(); + re2::StringPiece piece(s.data(), s.length()); + + int64_t max_replacements = options_.max_replacements; + while ((i < end) && (max_replacements != 0)) { + std::string found; + if (!re2::RE2::FindAndConsume(&piece, regex_find_, &found)) { + RETURN_NOT_OK(builder->Append(reinterpret_cast(i), + static_cast(end - i))); + i = end; + } else { + // wind back to the beginning of the match + const char* pos = piece.begin() - found.length(); + // the string before the pattern + RETURN_NOT_OK(builder->Append(reinterpret_cast(i), + static_cast(pos - i))); + // replace the pattern in what we found + if (!re2::RE2::Replace(&found, regex_replacement_, replacement)) { + return Status::Invalid("Regex found, but replacement failed"); + } + RETURN_NOT_OK(builder->Append(reinterpret_cast(found.data()), + static_cast(found.length()))); + // skip pattern + i = piece.begin(); + max_replacements--; + } + } + // If we exited early due to max_replacements, add the trailing part + RETURN_NOT_OK(builder->Append(reinterpret_cast(i), + static_cast(end - i))); + return Status::OK(); + } +}; +#endif + +template +using ReplaceSubStringPlain = ReplaceSubString; + +const FunctionDoc replace_substring_doc( + "Replace non-overlapping substrings that match pattern by replacement", + ("For each string in `strings`, replace non-overlapping substrings that match\n" + "`pattern` by `replacement`. 
If `max_replacements != -1`, it determines the\n" + "maximum amount of replacements made, counting from the left. Null values emit\n" + "null."), + {"strings"}, "ReplaceSubstringOptions"); + +#ifdef ARROW_WITH_RE2 +template +using ReplaceSubStringRegex = ReplaceSubString; + +const FunctionDoc replace_substring_regex_doc( + "Replace non-overlapping substrings that match regex `pattern` by `replacement`", + ("For each string in `strings`, replace non-overlapping substrings that match the\n" + "regular expression `pattern` by `replacement` using the Google RE2 library.\n" + "If `max_replacements != -1`, it determines the maximum amount of replacements\n" + "made, counting from the left. Note that if the pattern contains groups,\n" + "backreferencing macan be used. Null values emit null."), + {"strings"}, "ReplaceSubstringOptions"); +#endif + // ---------------------------------------------------------------------- // strptime string parsing @@ -1569,9 +1780,14 @@ const FunctionDoc strptime_doc( const FunctionDoc binary_length_doc( "Compute string lengths", - ("For each string in `strings`, emit its length. Null values emit null."), + ("For each string in `strings`, emit the number of bytes. Null values emit null."), {"strings"}); +const FunctionDoc utf8_length_doc("Compute UTF8 string lengths", + ("For each string in `strings`, emit the number of " + "UTF8 characters. Null values emit null."), + {"strings"}); + void AddStrptime(FunctionRegistry* registry) { auto func = std::make_shared("strptime", Arity::Unary(), &strptime_doc); DCHECK_OK(func->AddKernel({utf8()}, OutputType(StrptimeResolve), @@ -1597,6 +1813,21 @@ void AddBinaryLength(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); } +void AddUtf8Length(FunctionRegistry* registry) { + auto func = + std::make_shared("utf8_length", Arity::Unary(), &utf8_length_doc); + + ArrayKernelExec exec_offset_32 = + applicator::ScalarUnaryNotNull::Exec; + DCHECK_OK(func->AddKernel({utf8()}, int32(), std::move(exec_offset_32))); + + ArrayKernelExec exec_offset_64 = + applicator::ScalarUnaryNotNull::Exec; + DCHECK_OK(func->AddKernel({large_utf8()}, int64(), std::move(exec_offset_64))); + + DCHECK_OK(registry->AddFunction(std::move(func))); +} + template